{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pprint import pprint\n",
    "from IPython.display import display\n",
    "from hamilton import driver\n",
    "\n",
    "import __init__ as webscraper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Note: Hamilton collects completely anonymous data about usage. This will help us improve Hamilton over time. See https://github.com/apache/hamilton#usage-analytics--data-privacy for details.\n"
     ]
    },
    {
     "data": {
      "image/svg+xml": [
       "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
       "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
       " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
       "<!-- Generated by graphviz version 2.43.0 (0)\n",
       " -->\n",
       "<!-- Title: %3 Pages: 1 -->\n",
       "<svg width=\"847pt\" height=\"474pt\"\n",
       " viewBox=\"0.00 0.00 847.00 473.50\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 469.5)\">\n",
       "<title>%3</title>\n",
       "<polygon fill=\"white\" stroke=\"transparent\" points=\"-4,4 -4,-469.5 843,-469.5 843,4 -4,4\"/>\n",
       "<g id=\"clust1\" class=\"cluster\">\n",
       "<title>cluster__legend</title>\n",
       "<polygon fill=\"none\" stroke=\"black\" points=\"8,-199.5 8,-457.5 106,-457.5 106,-199.5 8,-199.5\"/>\n",
       "<text text-anchor=\"middle\" x=\"57\" y=\"-442.3\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">Legend</text>\n",
       "</g>\n",
       "<!-- parsed_html -->\n",
       "<g id=\"node1\" class=\"node\">\n",
       "<title>parsed_html</title>\n",
       "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M589,-169.5C589,-169.5 494,-169.5 494,-169.5 488,-169.5 482,-163.5 482,-157.5 482,-157.5 482,-117.5 482,-117.5 482,-111.5 488,-105.5 494,-105.5 494,-105.5 589,-105.5 589,-105.5 595,-105.5 601,-111.5 601,-117.5 601,-117.5 601,-157.5 601,-157.5 601,-163.5 595,-169.5 589,-169.5\"/>\n",
       "<text text-anchor=\"start\" x=\"493\" y=\"-148.3\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">parsed_html</text>\n",
       "<text text-anchor=\"start\" x=\"493.5\" y=\"-120.3\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">ParsingResult</text>\n",
       "</g>\n",
       "<!-- parsed_html_collection -->\n",
       "<g id=\"node4\" class=\"node\">\n",
       "<title>parsed_html_collection</title>\n",
       "<path fill=\"#b4d8e4\" stroke=\"#ea5556\" d=\"M823,-169.5C823,-169.5 646,-169.5 646,-169.5 640,-169.5 634,-163.5 634,-157.5 634,-157.5 634,-117.5 634,-117.5 634,-111.5 640,-105.5 646,-105.5 646,-105.5 823,-105.5 823,-105.5 829,-105.5 835,-111.5 835,-117.5 835,-117.5 835,-157.5 835,-157.5 835,-163.5 829,-169.5 823,-169.5\"/>\n",
       "<path fill=\"none\" stroke=\"#ea5556\" d=\"M827,-173.5C827,-173.5 642,-173.5 642,-173.5 636,-173.5 630,-167.5 630,-161.5 630,-161.5 630,-113.5 630,-113.5 630,-107.5 636,-101.5 642,-101.5 642,-101.5 827,-101.5 827,-101.5 833,-101.5 839,-107.5 839,-113.5 839,-113.5 839,-161.5 839,-161.5 839,-167.5 833,-173.5 827,-173.5\"/>\n",
       "<text text-anchor=\"start\" x=\"645\" y=\"-148.3\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">parsed_html_collection</text>\n",
       "<text text-anchor=\"start\" x=\"722\" y=\"-120.3\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">List</text>\n",
       "</g>\n",
       "<!-- parsed_html&#45;&gt;parsed_html_collection -->\n",
       "<g id=\"edge6\" class=\"edge\">\n",
       "<title>parsed_html&#45;&gt;parsed_html_collection</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M611.23,-137.5C614.09,-137.5 616.99,-137.5 619.92,-137.5\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"611.1,-137.5 601.1,-133 606.1,-137.5 601.1,-137.5 601.1,-137.5 601.1,-137.5 606.1,-137.5 601.1,-142 611.1,-137.5 611.1,-137.5\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"619.94,-141 629.94,-137.5 619.94,-134 619.94,-141\"/>\n",
       "</g>\n",
       "<!-- url -->\n",
       "<g id=\"node2\" class=\"node\">\n",
       "<title>url</title>\n",
       "<path fill=\"#b4d8e4\" stroke=\"#56e39f\" d=\"M233,-199.5C233,-199.5 144,-199.5 144,-199.5 138,-199.5 132,-193.5 132,-187.5 132,-187.5 132,-147.5 132,-147.5 132,-141.5 138,-135.5 144,-135.5 144,-135.5 233,-135.5 233,-135.5 239,-135.5 245,-141.5 245,-147.5 245,-147.5 245,-187.5 245,-187.5 245,-193.5 239,-199.5 233,-199.5\"/>\n",
       "<path fill=\"none\" stroke=\"#56e39f\" d=\"M237,-203.5C237,-203.5 140,-203.5 140,-203.5 134,-203.5 128,-197.5 128,-191.5 128,-191.5 128,-143.5 128,-143.5 128,-137.5 134,-131.5 140,-131.5 140,-131.5 237,-131.5 237,-131.5 243,-131.5 249,-137.5 249,-143.5 249,-143.5 249,-191.5 249,-191.5 249,-197.5 243,-203.5 237,-203.5\"/>\n",
       "<text text-anchor=\"start\" x=\"177.5\" y=\"-178.3\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">url</text>\n",
       "<text text-anchor=\"start\" x=\"143\" y=\"-150.3\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">Parallelizable</text>\n",
       "</g>\n",
       "<!-- url&#45;&gt;parsed_html -->\n",
       "<g id=\"edge1\" class=\"edge\">\n",
       "<title>url&#45;&gt;parsed_html</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M249.14,-178.69C303.06,-186.75 384.29,-193.8 453,-178.5 459.49,-177.05 466.07,-175.07 472.55,-172.76\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"472.63,-172.73 483.58,-173.35 477.3,-170.94 481.96,-169.15 481.96,-169.15 481.96,-169.15 477.3,-170.94 480.35,-164.95 472.63,-172.73 472.63,-172.73\"/>\n",
       "</g>\n",
       "<!-- html_page -->\n",
       "<g id=\"node3\" class=\"node\">\n",
       "<title>html_page</title>\n",
       "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M405.5,-169.5C405.5,-169.5 325.5,-169.5 325.5,-169.5 319.5,-169.5 313.5,-163.5 313.5,-157.5 313.5,-157.5 313.5,-117.5 313.5,-117.5 313.5,-111.5 319.5,-105.5 325.5,-105.5 325.5,-105.5 405.5,-105.5 405.5,-105.5 411.5,-105.5 417.5,-111.5 417.5,-117.5 417.5,-117.5 417.5,-157.5 417.5,-157.5 417.5,-163.5 411.5,-169.5 405.5,-169.5\"/>\n",
       "<text text-anchor=\"start\" x=\"324.5\" y=\"-148.3\" font-family=\"Helvetica,sans-Serif\" font-weight=\"bold\" font-size=\"14.00\">html_page</text>\n",
       "<text text-anchor=\"start\" x=\"356\" y=\"-120.3\" font-family=\"Helvetica,sans-Serif\" font-style=\"italic\" font-size=\"14.00\">str</text>\n",
       "</g>\n",
       "<!-- url&#45;&gt;html_page -->\n",
       "<g id=\"edge5\" class=\"edge\">\n",
       "<title>url&#45;&gt;html_page</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M249.24,-157.26C266.7,-154.27 285.81,-150.99 303.36,-147.98\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"303.63,-147.93 314.25,-150.68 308.56,-147.09 313.49,-146.24 313.49,-146.24 313.49,-146.24 308.56,-147.09 312.73,-141.81 303.63,-147.93 303.63,-147.93\"/>\n",
       "</g>\n",
       "<!-- html_page&#45;&gt;parsed_html -->\n",
       "<g id=\"edge2\" class=\"edge\">\n",
       "<title>html_page&#45;&gt;parsed_html</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M417.76,-137.5C434.69,-137.5 453.79,-137.5 471.8,-137.5\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"471.96,-141 481.96,-137.5 471.96,-134 471.96,-141\"/>\n",
       "</g>\n",
       "<!-- _parsed_html_inputs -->\n",
       "<g id=\"node5\" class=\"node\">\n",
       "<title>_parsed_html_inputs</title>\n",
       "<polygon fill=\"none\" stroke=\"black\" stroke-dasharray=\"5,2\" points=\"453,-87 278,-87 278,0 453,0 453,-87\"/>\n",
       "<text text-anchor=\"start\" x=\"302.5\" y=\"-60.3\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">remove_lines</text>\n",
       "<text text-anchor=\"start\" x=\"408.5\" y=\"-60.3\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">bool</text>\n",
       "<text text-anchor=\"start\" x=\"295\" y=\"-39.3\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">tags_to_extract</text>\n",
       "<text text-anchor=\"start\" x=\"411\" y=\"-39.3\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">List</text>\n",
       "<text text-anchor=\"start\" x=\"293.5\" y=\"-18.3\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">tags_to_remove</text>\n",
       "<text text-anchor=\"start\" x=\"411\" y=\"-18.3\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">List</text>\n",
       "</g>\n",
       "<!-- _parsed_html_inputs&#45;&gt;parsed_html -->\n",
       "<g id=\"edge3\" class=\"edge\">\n",
       "<title>_parsed_html_inputs&#45;&gt;parsed_html</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M447.13,-87.06C455.75,-91.72 464.45,-96.42 472.9,-100.98\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"471.48,-104.19 481.94,-105.87 474.81,-98.03 471.48,-104.19\"/>\n",
       "</g>\n",
       "<!-- _url_inputs -->\n",
       "<g id=\"node6\" class=\"node\">\n",
       "<title>_url_inputs</title>\n",
       "<polygon fill=\"none\" stroke=\"black\" stroke-dasharray=\"5,2\" points=\"99,-190 15,-190 15,-145 99,-145 99,-190\"/>\n",
       "<text text-anchor=\"start\" x=\"30\" y=\"-163.3\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">urls</text>\n",
       "<text text-anchor=\"start\" x=\"63\" y=\"-163.3\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">list</text>\n",
       "</g>\n",
       "<!-- _url_inputs&#45;&gt;url -->\n",
       "<g id=\"edge4\" class=\"edge\">\n",
       "<title>_url_inputs&#45;&gt;url</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M99.33,-167.5C105.12,-167.5 111.19,-167.5 117.33,-167.5\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"117.68,-171 127.68,-167.5 117.68,-164 117.68,-171\"/>\n",
       "</g>\n",
       "<!-- input -->\n",
       "<g id=\"node7\" class=\"node\">\n",
       "<title>input</title>\n",
       "<polygon fill=\"none\" stroke=\"black\" stroke-dasharray=\"5,2\" points=\"86.5,-426 27.5,-426 27.5,-389 86.5,-389 86.5,-426\"/>\n",
       "<text text-anchor=\"middle\" x=\"57\" y=\"-403.8\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">input</text>\n",
       "</g>\n",
       "<!-- function -->\n",
       "<g id=\"node8\" class=\"node\">\n",
       "<title>function</title>\n",
       "<path fill=\"#b4d8e4\" stroke=\"black\" d=\"M85,-371C85,-371 29,-371 29,-371 23,-371 17,-365 17,-359 17,-359 17,-346 17,-346 17,-340 23,-334 29,-334 29,-334 85,-334 85,-334 91,-334 97,-340 97,-346 97,-346 97,-359 97,-359 97,-365 91,-371 85,-371\"/>\n",
       "<text text-anchor=\"middle\" x=\"57\" y=\"-348.8\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">function</text>\n",
       "</g>\n",
       "<!-- expand -->\n",
       "<g id=\"node9\" class=\"node\">\n",
       "<title>expand</title>\n",
       "<path fill=\"#b4d8e4\" stroke=\"#56e39f\" d=\"M82,-312C82,-312 32,-312 32,-312 26,-312 20,-306 20,-300 20,-300 20,-287 20,-287 20,-281 26,-275 32,-275 32,-275 82,-275 82,-275 88,-275 94,-281 94,-287 94,-287 94,-300 94,-300 94,-306 88,-312 82,-312\"/>\n",
       "<path fill=\"none\" stroke=\"#56e39f\" d=\"M86,-316C86,-316 28,-316 28,-316 22,-316 16,-310 16,-304 16,-304 16,-283 16,-283 16,-277 22,-271 28,-271 28,-271 86,-271 86,-271 92,-271 98,-277 98,-283 98,-283 98,-304 98,-304 98,-310 92,-316 86,-316\"/>\n",
       "<text text-anchor=\"middle\" x=\"57\" y=\"-289.8\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">expand</text>\n",
       "</g>\n",
       "<!-- collect -->\n",
       "<g id=\"node10\" class=\"node\">\n",
       "<title>collect</title>\n",
       "<path fill=\"#b4d8e4\" stroke=\"#ea5556\" d=\"M78.5,-249C78.5,-249 35.5,-249 35.5,-249 29.5,-249 23.5,-243 23.5,-237 23.5,-237 23.5,-224 23.5,-224 23.5,-218 29.5,-212 35.5,-212 35.5,-212 78.5,-212 78.5,-212 84.5,-212 90.5,-218 90.5,-224 90.5,-224 90.5,-237 90.5,-237 90.5,-243 84.5,-249 78.5,-249\"/>\n",
       "<path fill=\"none\" stroke=\"#ea5556\" d=\"M82.5,-253C82.5,-253 31.5,-253 31.5,-253 25.5,-253 19.5,-247 19.5,-241 19.5,-241 19.5,-220 19.5,-220 19.5,-214 25.5,-208 31.5,-208 31.5,-208 82.5,-208 82.5,-208 88.5,-208 94.5,-214 94.5,-220 94.5,-220 94.5,-241 94.5,-241 94.5,-247 88.5,-253 82.5,-253\"/>\n",
       "<text text-anchor=\"middle\" x=\"57\" y=\"-226.8\" font-family=\"Helvetica,sans-Serif\" font-size=\"14.00\">collect</text>\n",
       "</g>\n",
       "</g>\n",
       "</svg>\n"
      ],
      "text/plain": [
       "<graphviz.graphs.Digraph at 0x7f53adf495a0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "dr = (\n",
    "    driver.Builder()\n",
    "    .enable_dynamic_execution(allow_experimental_mode=True)  # this allows parallel/collect nodes\n",
    "    .with_modules(webscraper)\n",
    "    .build()\n",
    ")\n",
    "\n",
    "display(dr.display_all_functions(None))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/tjean/projects/dagworks/hamilton/contrib/hamilton/contrib/user/zilto/webscraper/__init__.py:52: GuessedAtParserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"lxml\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n",
      "\n",
      "The code that caused this warning is on line 52 of the file /home/tjean/projects/dagworks/hamilton/contrib/hamilton/contrib/user/zilto/webscraper/__init__.py. To get rid of this warning, pass the additional argument 'features=\"lxml\"' to the BeautifulSoup constructor.\n",
      "\n",
      "  soup = BeautifulSoup(html_page)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['parsed_html_collection']\n"
     ]
    }
   ],
   "source": [
    "final_vars = [\"parsed_html_collection\"]\n",
    "\n",
    "inputs = dict(\n",
    "    urls=[\n",
    "        \"https://blog.dagworks.io/p/llmops-production-prompt-engineering\",\n",
    "    ]\n",
    ")\n",
    "\n",
    "overrides = dict()\n",
    "\n",
    "res = dr.execute(\n",
    "    final_vars=final_vars,\n",
    "    inputs=inputs,\n",
    "    overrides=overrides\n",
    ")\n",
    "\n",
    "pprint(list(res.keys()), width=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'What you send to your LLM is quite important. Small variations and changes can have large impacts on outputs, so as your product evolves, the need to evolve your prompts will too. LLMs are also constantly being developed and released, and so as LLMs change, your prompts will also need to change. Therefore it’s important to set up an iteration pattern to operationalize how you “deploy” your prompts so you and your team can move efficiently, but also ensure that production issues are minimized, if not avoided. In this post, we’ll guide you through the best practices of managing prompts with Hamilton, making analogies to MLOps patterns, and discussing trade-offs along the way. Notes:(1): if you’re looking for a post that talks about “context management” this isn’t that post. But it is the post that will help you with the nuts and bolts on how to iterate and create that production grade “prompt context management” iteration story. (2): we’ll use prompt & prompt template interchangeably. (3): we’ll assume an “online” web-service setting is where these prompts are being used. (4): we’ll be using ourHamilton’s PDF summarizer exampleto project our patterns onto. (5): not familiar withHamilton? You can either learn about Hamilton viaTry Hamiltonand come back, or get the high level LLMOps approach from this post and then dig into Hamilton via thePDF Summarizer example. (6): what’s our credibility here? We’ve spent our careers building self-service data/MLOps tooling, most famously for Stitch Fix’s 100+ Data Scientists. So we’ve seen our share of outages and approaches play out over time. Thanks for reading DAGWorks’s Substack! Subscribe for free to receive updates and posts like this! Point:Prompts + LLM APIs are analogous to hyper-parameters + machine learning models. In terms of “Ops” practices, LLMOps is still in its infancy. 
MLOps is a little older, but still neither are widely adopted if you’re comparing it to how widespread knowledge is around DevOps practices. DevOps practices largely concern themselves with how you ship code to production, and MLOps practices how to ship code& data artifacts(e.g., statistical models)to production. So what about LLMOps? Personally, I think it’s closer to MLOps since you have: your LLM workflow is simply code. and an LLM API is a data artifact that can be “tweaked” using prompts, similar to a machine learning (ML) model and its hyper-parameters. Therefore, you most likely care about versioning the LLM API + prompts together tightly for good production practices. For instance, in MLOps practice,  you’d want a process in place to validate your ML model still behaves correctly whenever its hyper-parameters are changed. To be clear, the two parts to control for are theLLMand theprompts. Much like  MLOps, when the code or the model artifact changes, you want to be able to determine which did. For LLMOps, we’ll want the same discernment, separating the LLM workflow from the LLM API + prompts. Importantly, we should consider LLMs (self-hosted or APIs) to be mostly static since we less frequently update (or even control) their internals. So, changing thepromptspart of LLM API + prompts is effectively like creating a new model artifact. There are two main ways to treat prompts: Prompts as dynamic runtime variables. The template used isn’t static to a deployment. Prompts as code.The prompt template is static/ predetermined given a deployment. The main difference is the amount of moving parts you need to manage to ensure a great production story. Below, we dig into how to use Hamilton in the context of these two approaches. Prompts are just strings. 
Since strings are a primitive type in most languages, this means that they are quite easy to pass around.\\xa0 The idea is to abstract your code so that at runtime you pass in the prompts required.\\xa0 More concretely, you’d “load/reload” prompt templates whenever there’s an “updated” one. The MLOps analogy here, would be to auto-reload the ML model artifact (e.g., a pkl file) whenever a new model is available. The benefit here is that you can very quickly roll out new prompts because you do not need to redeploy your application! The downside to this iteration speed is increased operational burden: To someone monitoring your application, it’ll be unclear when the change occurred and whether it’s propagated itself through your systems. For example, you just pushed a new prompt, and the LLM now returns more tokens per request, causing latency to spike; whoever is monitoring will likely be puzzled, unless you have a great change log culture. Rollback semantics involve having to know aboutanothersystem. You can’t just rollback a prior deployment to fix things. You’ll need great monitoring to understand what was run and when; e.g., when customer service gives you a ticket to investigate, how do you know what prompt was in use? You’ll need to manage and monitor whatever system you’re using to manage and store your prompts. This will be an extra system you’ll need to maintain outside of whatever is serving your code. You’ll need to manage two processes, one for updating and pushing the service, and one for updating and pushing prompts. Synchronizing these changes will be on you. For example, you need to make a code change to your service to handle a new prompt. You will need to coordinate changing two systems to make it work, which is extra operational overhead to manage. 
Our PDF summarizer flow would look something like this if you removesummarize_text_from_summaries_promptandsummarize_chunk_of_text_promptfunction definitions: To operate things, you’ll want to either inject the prompts at request time: Or\\xa0you change your code to dynamically load prompts, i.e., add functions to retrieve prompts from an external system as part of the Hamilton dataflow. At each invocation, they will query for the prompt to use (you can of course cache this for performance): Driver code: Here we outline a few ways to monitor what went on. Log results of execution. That is run Hamilton, then emit information to wherever you want it to go. Note. In the above, Hamilton allows you to requestanyintermediateoutputs simply by requesting “functions” (i.e. nodes in the diagram) by name. If we really want to get all the intermediate outputs of the entire dataflow, we can do so and log it wherever we want to! Use loggers inside Hamilton functions (to see the power of this approach,see my old talk on structured logs): Extend Hamilton to emit this information. You use Hamilton to capture information from executed functions, i.e. nodes, without needing to insert logging statement inside the function’s body. This promotes reusability since you can toggle logging between development and production settings at the Driver level. SeeGraphAdapters, or write your ownPython decoratorto wrap functions for monitoring. In any of the above code, you could easily pull in a 3rd party tool to help track & monitor the code, as well as the external API call, e.g. data dog. Note, with a one-line code change, you can plug in the DAGWorks’s Driver and get all that monitoring you’d want and more. (Try the free tierhere)! Since prompts are simply strings, they’re also very amenable to being stored along with your source code. The idea is to store as many prompt versions as you like, within your code so that at runtime, the set of prompts available is fixed and deterministic. 
The MLOps analogy here is, instead of dynamically reloading models, you instead bake the ML model into the container/hard code the reference. Once deployed, your app has everything that it needs. The deployment is immutable; nothing changes once it’s up. This makes debugging & determining what’s going on, much simpler. This approach has many operational benefits: Whenever a new prompt is pushed, it forces a new deployment. Rollback semantics are clear if there’s an issue with a new prompt. You can submit a pull request (PR) for the source code and prompts at the same time. It becomes simpler to review what the change is, and the downstream dependencies of what these prompts will touch/interact with. You can add checks to your CI/CD system to ensure bad prompts don’t make it to production. It’s simpler to debug an issue. You just pull the (Docker) container that was created and you’ll be able to exactly replicate any customer issue quickly and easily. There is no other “prompt system” to maintain or manage. Simplifying operations. It doesn’t preclude adding extra monitoring and visibility. The prompts would be encoded into functions into the dataflow/directed acyclic graph (DAG): Pairing this code withgit, we have a lightweight versioning system for your entire dataflow (i.e. “chain”), so you can always discern what state the world was in, given a git commit SHA. If you want to manage and have access to multiple prompts at any given point in time, Hamilton has two powerful abstractions to enable you to do so:@config.whenandPython modules. This allows you to store and keep available all older prompt versions together and specify which one to use via code. Hamilton has a concept of decorators, which are just annotations on functions. The@config.whendecorator allows to specify alternative implementations for a functions, i.e. “node”, in your dataflow. In this case, we specify alternative prompts. 
You can keep adding functions annotated with@config.when, allowing you to swap between them using configuration passed to the HamiltonDriver. When instantiating theDriver,  it will construct the dataflow using the prompt implementation associated with the configuration value. Alternatively to using@config.when, you can instead place your different prompt implementations into different Python modules. Then, atDriverconstruction time, pass the correct module for the context you want to use. So here we have one module housing V1 of our prompt: Here we have one module housing V2 (see how they differ slightly): In the driver code below, we choose the right module to use based on some context. Using the module approach allows us to encapsulate and version whole sets of prompts together. If you want to go back in time (via git), or see what a blessed prompt version was, you just need to navigate to the correct commit, and then look in the right module. Assuming you’re using git to track your code, you wouldn’t need to record what prompts were being used. Instead, you’d just need to know what git commit SHA is deployed and you’ll be able to track the version of your code and prompts simultaneously. To monitor flows, just like the above approach, you have the same monitoring hooks available at your disposal, and I wont repeat them here, but they are: Request any intermediate outputs and log them yourself outside of Hamilton. Log them from within the function yourself, or build aPython decorator/GraphAdapterto do it at the framework level. Integrate 3rd party tooling for monitoring your code and LLM API calls, or use the DAGWorks Platform offering to monitor it all. (Try the free tierhere)! or all the above! With any ML initiative, it’s important to measure business impacts of changes. Likewise, with LLMs + prompts, it’ll be important to test and measure changes against important business metrics. 
In the MLOps world, you’d be A/B testing ML models to evaluate their business value by dividing traffic between them. To ensure the randomness necessary to A/B tests, you wouldn’t know at runtime which model to use until a coin is flipped. However, to get those models out, they both would have follow a process to qualify them. So for prompts, we should think similarly. The above two prompt engineering patterns don’t preclude you from being able to A/B test prompts, but it means you need to manage a process to enable however many prompt templates you’re testing in parallel. If you’re also adjusting code paths, having them in code will be simpler to discern and debug what is going on, and you can make use of the `@config.when` decorator / python module swapping for this purpose. Versus, having to critically rely on your logging/monitoring/observability stack to tell you what prompt was used if you’re dynamically loading/passing them in and then having to mentally map which prompts go with which code paths. Note, this all gets harder if you start needing to change multiple prompts for an A/B test because you have several of them in a flow. For example you have two prompts in your workflow and you’re changing LLMs, you’ll want to A/B test the change holistically, rather than individually per prompt. Our advice, by putting the prompts into code your operational life will be simpler, since you’ll know what two prompts belong to what code paths without having to do any mental mapping. Thank you for reading DAGWorks’s Substack. This post is public so feel free to share it. Share In this post, we explained two patterns for managing prompts in a production environment with Hamilton. The first approach treatsprompts asdynamic runtime variables,while the second, treatsprompts as codefor production settings. 
If you value reducing operational burden, then our advice is to encode prompts as code, as it is operationally simpler, unless the speed to change them really matters for you. To recap: Prompts as dynamic runtime variables. Use an external system to pass the prompts to your Hamilton dataflows, or use Hamilton to pull them from a DB. For debugging & monitoring, it’s important to be able to determine what prompt was used for a given invocation. You can integrate open source tools, or use something like the DAGWorks Platform to help ensure you know what was used for any invocation of your code. Prompts as code.Encoding the prompts as code allows easy versioning with git. Change management can be done via pull requests and CI/CD checks. It works well with Hamilton’s features like@config.whenand module switching at the Driver level because it determines clearly what version of the prompt is used. This approach strengthens the use of any tooling you might use to monitor or track, like the DAGWorks Platform, as prompts for a deployment are immutable.  If you’re excited by any of this, or have strong opinions, leave a comment, or drop by our Slack channel! Some links to do praise/complain/chat: 📣join our community on Slack—\\u200awe’re more than happy to help answer questions you might have or get you started. ⭐️ us onGitHub. 📝 leave us anissueif you find something. 📚 read ourdocumentation. ⌨️ interactivelylearn about Hamilton in your browser. We have a growing collection of posts & content. Here are some we think you might be interested in. Skip learning convoluted LLM-specific frameworks and write your first LLM application using regular Python functions and Hamilton! In this post, we’ll present a containerized PDF summarizer powered by the OpenAI API. Its flow is encoded in Hamilton, which the In this post, we’re going to share how Hamilton can help you write modular and maintainable code for your large language model (LLM) application stack. 
Hamilton is great for describing any type of dataflow, which is exactly what you’re doing when building an LLM powered application. With Hamilton you get strong tryhamilton.dev– an interactive tutorial in your browser! Hamilton + Airflow(GitHub repo) Hamilton + Feast(GitHub repo) Pandas data transformations in Hamilton in 5 minutes Lineage + Hamilton in 10 minutes No posts Ready for more? your LLM workflow is simply code. and an LLM API is a data artifact that can be “tweaked” using prompts, similar to a machine learning (ML) model and its hyper-parameters. Prompts as dynamic runtime variables. The template used isn’t static to a deployment. Prompts as code.The prompt template is static/ predetermined given a deployment. To someone monitoring your application, it’ll be unclear when the change occurred and whether it’s propagated itself through your systems. For example, you just pushed a new prompt, and the LLM now returns more tokens per request, causing latency to spike; whoever is monitoring will likely be puzzled, unless you have a great change log culture. Rollback semantics involve having to know aboutanothersystem. You can’t just rollback a prior deployment to fix things. You’ll need great monitoring to understand what was run and when; e.g., when customer service gives you a ticket to investigate, how do you know what prompt was in use? You’ll need to manage and monitor whatever system you’re using to manage and store your prompts. This will be an extra system you’ll need to maintain outside of whatever is serving your code. You’ll need to manage two processes, one for updating and pushing the service, and one for updating and pushing prompts. Synchronizing these changes will be on you. For example, you need to make a code change to your service to handle a new prompt. You will need to coordinate changing two systems to make it work, which is extra operational overhead to manage. Log results of execution. 
That is run Hamilton, then emit information to wherever you want it to go. Use loggers inside Hamilton functions (to see the power of this approach,see my old talk on structured logs): Extend Hamilton to emit this information. You use Hamilton to capture information from executed functions, i.e. nodes, without needing to insert logging statement inside the function’s body. This promotes reusability since you can toggle logging between development and production settings at the Driver level. SeeGraphAdapters, or write your ownPython decoratorto wrap functions for monitoring. Whenever a new prompt is pushed, it forces a new deployment. Rollback semantics are clear if there’s an issue with a new prompt. You can submit a pull request (PR) for the source code and prompts at the same time. It becomes simpler to review what the change is, and the downstream dependencies of what these prompts will touch/interact with. You can add checks to your CI/CD system to ensure bad prompts don’t make it to production. It’s simpler to debug an issue. You just pull the (Docker) container that was created and you’ll be able to exactly replicate any customer issue quickly and easily. There is no other “prompt system” to maintain or manage. Simplifying operations. It doesn’t preclude adding extra monitoring and visibility. Request any intermediate outputs and log them yourself outside of Hamilton. Log them from within the function yourself, or build aPython decorator/GraphAdapterto do it at the framework level. Integrate 3rd party tooling for monitoring your code and LLM API calls, or use the DAGWorks Platform offering to monitor it all. (Try the free tierhere)! or all the above! Prompts as dynamic runtime variables. Use an external system to pass the prompts to your Hamilton dataflows, or use Hamilton to pull them from a DB. For debugging & monitoring, it’s important to be able to determine what prompt was used for a given invocation. 
You can integrate open source tools, or use something like the DAGWorks Platform to help ensure you know what was used for any invocation of your code. Prompts as code.Encoding the prompts as code allows easy versioning with git. Change management can be done via pull requests and CI/CD checks. It works well with Hamilton’s features like@config.whenand module switching at the Driver level because it determines clearly what version of the prompt is used. This approach strengthens the use of any tooling you might use to monitor or track, like the DAGWorks Platform, as prompts for a deployment are immutable. 📣join our community on Slack—\\u200awe’re more than happy to help answer questions you might have or get you started. ⭐️ us onGitHub. 📝 leave us anissueif you find something. 📚 read ourdocumentation. ⌨️ interactivelylearn about Hamilton in your browser. tryhamilton.dev– an interactive tutorial in your browser! Hamilton + Airflow(GitHub repo) Hamilton + Feast(GitHub repo) Pandas data transformations in Hamilton in 5 minutes Lineage + Hamilton in 10 minutes DAGWorks’s SubstackSubscribeSign inShare this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherDiscover more from DAGWorks’s SubstackThought posts, and updates on Hamilton and the DAGWorks Platform.SubscribeContinue readingSign inLLMOps: Production prompt engineering patterns with HamiltonAn overview of the production grade ways to iterate on prompts with Hamilton.DAGWorks Inc.,Stefan Krawczyk, andThierry JeanSep 6, 20234Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherShareWhat you send to your LLM is quite important. Small variations and changes can have large impacts on outputs, so as your product evolves, the need to evolve your prompts will too. LLMs are also constantly being developed and released, and so as LLMs change, your prompts will also need to change. 
Therefore it’s important to set up an iteration pattern to operationalize how you “deploy” your prompts so you and your team can move efficiently, but also ensure that production issues are minimized, if not avoided. In this post, we’ll guide you through the best practices of managing prompts with Hamilton, making analogies to MLOps patterns, and discussing trade-offs along the way.Notes:(1): if you’re looking for a post that talks about “context management” this isn’t that post. But it is the post that will help you with the nuts and bolts on how to iterate and create that production grade “prompt context management” iteration story.(2): we’ll use prompt & prompt template interchangeably.(3): we’ll assume an “online” web-service setting is where these prompts are being used.(4): we’ll be using ourHamilton’s PDF summarizer exampleto project our patterns onto.(5): not familiar withHamilton? You can either learn about Hamilton viaTry Hamiltonand come back, or get the high level LLMOps approach from this post and then dig into Hamilton via thePDF Summarizer example.(6): what’s our credibility here? We’ve spent our careers building self-service data/MLOps tooling, most famously for Stitch Fix’s 100+ Data Scientists. So we’ve seen our share of outages and approaches play out over time.Thanks for reading DAGWorks’s Substack! Subscribe for free to receive updates and posts like this!SubscribePrompts are to LLMs what hyper-parameters are to ML modelsPoint:Prompts + LLM APIs are analogous to hyper-parameters + machine learning models.In terms of “Ops” practices, LLMOps is still in its infancy. MLOps is a little older, but still neither are widely adopted if you’re comparing it to how widespread knowledge is around DevOps practices.DevOps practices largely concern themselves with how you ship code to production, and MLOps practices how to ship code& data artifacts(e.g., statistical models)to production. So what about LLMOps? 
Personally, I think it’s closer to MLOps since you have:your LLM workflow is simply code.and an LLM API is a data artifact that can be “tweaked” using prompts, similar to a machine learning (ML) model and its hyper-parameters.Therefore, you most likely care about versioning the LLM API + prompts together tightly for good production practices. For instance, in MLOps practice,  you’d want a process in place to validate your ML model still behaves correctly whenever its hyper-parameters are changed.How should you think about operationalizing a prompt?To be clear, the two parts to control for are theLLMand theprompts. Much like  MLOps, when the code or the model artifact changes, you want to be able to determine which did. For LLMOps, we’ll want the same discernment, separating the LLM workflow from the LLM API + prompts. Importantly, we should consider LLMs (self-hosted or APIs) to be mostly static since we less frequently update (or even control) their internals. So, changing thepromptspart of LLM API + prompts is effectively like creating a new model artifact.There are two main ways to treat prompts:Prompts as dynamic runtime variables. The template used isn’t static to a deployment.Prompts as code.The prompt template is static/ predetermined given a deployment.The main difference is the amount of moving parts you need to manage to ensure a great production story. Below, we dig into how to use Hamilton in the context of these two approaches.Prompts as dynamic runtime variablesDynamically Pass/Load PromptsPrompts are just strings. 
Since strings are a primitive type in most languages, this means that they are quite easy to pass around.\\xa0 The idea is to abstract your code so that at runtime you pass in the prompts required.\\xa0 More concretely, you’d “load/reload” prompt templates whenever there’s an “updated” one.The MLOps analogy here, would be to auto-reload the ML model artifact (e.g., a pkl file) whenever a new model is available.MLOps Analogy: diagram showing how ML model auto reloading would look.Diagram showing what dynamically reloading/querying prompts would look like.The benefit here is that you can very quickly roll out new prompts because you do not need to redeploy your application!The downside to this iteration speed is increased operational burden:To someone monitoring your application, it’ll be unclear when the change occurred and whether it’s propagated itself through your systems. For example, you just pushed a new prompt, and the LLM now returns more tokens per request, causing latency to spike; whoever is monitoring will likely be puzzled, unless you have a great change log culture.Rollback semantics involve having to know aboutanothersystem. You can’t just rollback a prior deployment to fix things.You’ll need great monitoring to understand what was run and when; e.g., when customer service gives you a ticket to investigate, how do you know what prompt was in use?You’ll need to manage and monitor whatever system you’re using to manage and store your prompts. This will be an extra system you’ll need to maintain outside of whatever is serving your code.You’ll need to manage two processes, one for updating and pushing the service, and one for updating and pushing prompts. Synchronizing these changes will be on you. For example, you need to make a code change to your service to handle a new prompt. 
You will need to coordinate changing two systems to make it work, which is extra operational overhead to manage.How it would work with HamiltonOur PDF summarizer flow would look something like this if you removesummarize_text_from_summaries_promptandsummarize_chunk_of_text_promptfunction definitions:summarization_shortened.py. Note the two inputs “*_prompt” that denote prompts that are now required as input to the dataflow to function. With Hamilton you’ll be able to determine what inputs should be required for your prompt template by just looking at a diagram like this. Diagram created via Hamilton.To operate things, you’ll want to either inject the prompts at request time:from hamilton import base, driver\\nimport summarization_shortend\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(summarization_sortened)\\n    .build()\\n)\\n\\n# pull prompts from somewhere\\nsummarize_chunk_of_text_prompt = \"\"\"SOME PROMPT FOR {chunked_text}\"\"\"\\nsummarize_text_from_summaries_prompt = \"\"\"SOME PROMPT {summarized_chunks} ... {user_query}\"\"\"\\n\\n# execute, and pass in the prompt \\nresult = dr.execute(\\n   [\"summarized_text\"],\\n   inputs={\\n      \"summarize_chunk_of_text_prompt\": summarize_chunk_of_text_prompt,\\n      ...\\n   }\\n)Or\\xa0you change your code to dynamically load prompts, i.e., add functions to retrieve prompts from an external system as part of the Hamilton dataflow. 
At each invocation, they will query for the prompt to use (you can of course cache this for performance):# prompt_template_loaders.py\\n\\ndef summarize_chunk_of_text_prompt(\\n  db_client: Client, other_args: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query( \\n         \"get latest prompt X from DB\", other_args)\\n    return _prompt\\n\\ndef summarize_text_from_summaries_prompt(\\n   db_client: Client, another_arg: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query(\\n         \"get latest prompt Y from DB\", another_arg)\\n    return _promptDriver code:from hamilton import base, driver\\nimport prompt_template_loaders # <-- load this to provide prompt input\\nimport summarization_shortend\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(\\n        prompt_template_loaders,# <-- Hamilton will call above functions\\n        summarization_sortened, \\n    )\\n    .build()\\n)\\n\\n# execute, and pass in the prompt \\nresult = dr.execute(\\n   [\"summarized_text\"],\\n   inputs={\\n      # don\\'t need to pass prompts in this version\\n   }\\n)How would I log prompts used and monitor flows?Here we outline a few ways to monitor what went on.Log results of execution. That is run Hamilton, then emit information to wherever you want it to go.result = dr.execute(\\n   [\"summarized_text\", \\n    \"summarize_chunk_of_text_prompt\",   \\n    ... # and anything else you want to pull out\\n    \"summarize_text_from_summaries_prompt\"],\\n   inputs={\\n      # don\\'t need to pass prompts in this version\\n   }\\n)\\n\\nmy_log_system(result) # send what you want for safe keeping to some\\n                      # system that you own.Note. In the above, Hamilton allows you to requestanyintermediateoutputs simply by requesting “functions” (i.e. nodes in the diagram) by name. 
If we really want to get all the intermediate outputs of the entire dataflow, we can do so and log it wherever we want to!Use loggers inside Hamilton functions (to see the power of this approach,see my old talk on structured logs):import logging\\n\\nlogger = logging.getLogger(__name__)\\n\\ndef summarize_text_from_summaries_prompt(\\n    db_client: Client, another_arg: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query(\\n         \"get latest prompt Y from DB\", another_arg)\\n    logger.info(f\"Prompt used is [{_prompt}]\")\\n    return _promptExtend Hamilton to emit this information. You use Hamilton to capture information from executed functions, i.e. nodes, without needing to insert logging statement inside the function’s body. This promotes reusability since you can toggle logging between development and production settings at the Driver level. SeeGraphAdapters, or write your ownPython decoratorto wrap functions for monitoring.In any of the above code, you could easily pull in a 3rd party tool to help track & monitor the code, as well as the external API call, e.g. data dog. Note, with a one-line code change, you can plug in the DAGWorks’s Driver and get all that monitoring you’d want and more. (Try the free tierhere)!Prompts as codePrompts as static stringsSince prompts are simply strings, they’re also very amenable to being stored along with your source code. The idea is to store as many prompt versions as you like, within your code so that at runtime, the set of prompts available is fixed and deterministic.The MLOps analogy here is, instead of dynamically reloading models, you instead bake the ML model into the container/hard code the reference. Once deployed, your app has everything that it needs. The deployment is immutable; nothing changes once it’s up. 
This makes debugging & determining what’s going on, much simpler.MLOps Analogy: make an immutable deployment by making the model fixed for your app’s deployment.Diagram showing how treating prompts as code enables you to leverage your CI/CD and build an immutable deployment for talking to your LLM API.This approach has many operational benefits:Whenever a new prompt is pushed, it forces a new deployment. Rollback semantics are clear if there’s an issue with a new prompt.You can submit a pull request (PR) for the source code and prompts at the same time. It becomes simpler to review what the change is, and the downstream dependencies of what these prompts will touch/interact with.You can add checks to your CI/CD system to ensure bad prompts don’t make it to production.It’s simpler to debug an issue. You just pull the (Docker) container that was created and you’ll be able to exactly replicate any customer issue quickly and easily.There is no other “prompt system” to maintain or manage. Simplifying operations.It doesn’t preclude adding extra monitoring and visibility.How it would work with HamiltonThe prompts would be encoded into functions into the dataflow/directed acyclic graph (DAG):What summarization.py in the PDF summarizer example looks like. The prompt templates are part of the code. Diagram created via Hamilton.Pairing this code withgit, we have a lightweight versioning system for your entire dataflow (i.e. “chain”), so you can always discern what state the world was in, given a git commit SHA. If you want to manage and have access to multiple prompts at any given point in time, Hamilton has two powerful abstractions to enable you to do so:@config.whenandPython modules. This allows you to store and keep available all older prompt versions together and specify which one to use via code.@config.when (docs)Hamilton has a concept of decorators, which are just annotations on functions. 
The@config.whendecorator allows to specify alternative implementations for a functions, i.e. “node”, in your dataflow. In this case, we specify alternative prompts.from hamilton.function_modifiers import config\\n\\n@config.when(version=\"v1\")\\ndef summarize_chunk_of_text_prompt__v1() -> str:\\n    \"\"\"V1 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text. Extract any key points with reasoning.\\\\n\\\\nContent:\"\\n\\n@config.when(version=\"v2\")\\ndef summarize_chunk_of_text_prompt__v2(content_type: str = \"an academic paper\") -> str:\\n    \"\"\"V2 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text from {content_type}. Extract the key points with reasoning. \\\\n\\\\nContent:\"You can keep adding functions annotated with@config.when, allowing you to swap between them using configuration passed to the HamiltonDriver. When instantiating theDriver,  it will construct the dataflow using the prompt implementation associated with the configuration value.from hamilton import base, driver\\nimport summarization\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(summarization)\\n    .with_config({\"version\": \"v1\"}) # V1 is chosen. Use \"v2\\' for V2.\\n    .build()\\n)Module switchingAlternatively to using@config.when, you can instead place your different prompt implementations into different Python modules. Then, atDriverconstruction time, pass the correct module for the context you want to use.So here we have one module housing V1 of our prompt:# prompts_v1.py\\ndef summarize_chunk_of_text_prompt() -> str:\\n    \"\"\"V1 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text. 
Extract any key points with reasoning.\\\\n\\\\nContent:\"Here we have one module housing V2 (see how they differ slightly):# prompts_v2.py\\ndef summarize_chunk_of_text_prompt(content_type: str = \"an academic paper\") -> str:\\n    \"\"\"V2 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text from {content_type}. Extract the key points with reasoning. \\\\n\\\\nContent:\"In the driver code below, we choose the right module to use based on some context.# run.py\\nfrom hamilton import driver\\nimport summarization\\nimport prompts_v1\\nimport prompts_v2\\n\\n# create driver -- passing in the right module we want\\ndr = (\\n    driver.Builder()\\n    .with_modules(\\n        prompts_v1,  # or prompts_v2\\n        summarization,\\n    )\\n    .build()\\n)Using the module approach allows us to encapsulate and version whole sets of prompts together. If you want to go back in time (via git), or see what a blessed prompt version was, you just need to navigate to the correct commit, and then look in the right module.How would I log prompts used and monitor flows?Assuming you’re using git to track your code, you wouldn’t need to record what prompts were being used. Instead, you’d just need to know what git commit SHA is deployed and you’ll be able to track the version of your code and prompts simultaneously.To monitor flows, just like the above approach, you have the same monitoring hooks available at your disposal, and I wont repeat them here, but they are:Request any intermediate outputs and log them yourself outside of Hamilton.Log them from within the function yourself, or build aPython decorator/GraphAdapterto do it at the framework level.Integrate 3rd party tooling for monitoring your code and LLM API calls, or use the DAGWorks Platform offering to monitor it all. (Try the free tierhere)!or all the above!What about A/B testing my prompts?With any ML initiative, it’s important to measure business impacts of changes. 
Likewise, with LLMs + prompts, it’ll be important to test and measure changes against important business metrics. In the MLOps world, you’d be A/B testing ML models to evaluate their business value by dividing traffic between them. To ensure the randomness necessary for A/B tests, you wouldn’t know at runtime which model to use until a coin is flipped. However, to get those models out, they both would have to follow a process to qualify them. So for prompts, we should think similarly.The above two prompt engineering patterns don’t preclude you from being able to A/B test prompts, but it means you need to manage a process to enable however many prompt templates you’re testing in parallel. If you’re also adjusting code paths, having them in code will be simpler to discern and debug what is going on, and you can make use of the `@config.when` decorator / python module swapping for this purpose. Versus, having to critically rely on your logging/monitoring/observability stack to tell you what prompt was used if you’re dynamically loading/passing them in and then having to mentally map which prompts go with which code paths.Note, this all gets harder if you start needing to change multiple prompts for an A/B test because you have several of them in a flow. For example you have two prompts in your workflow and you’re changing LLMs, you’ll want to A/B test the change holistically, rather than individually per prompt. Our advice, by putting the prompts into code your operational life will be simpler, since you’ll know what two prompts belong to what code paths without having to do any mental mapping.Thank you for reading DAGWorks’s Substack. This post is public so feel free to share it.ShareSummaryIn this post, we explained two patterns for managing prompts in a production environment with Hamilton. The first approach treatsprompts asdynamic runtime variables,while the second, treatsprompts as codefor production settings. 
If you value reducing operational burden, then our advice is to encode prompts as code, as it is operationally simpler, unless the speed to change them really matters for you.To recap:Prompts as dynamic runtime variables. Use an external system to pass the prompts to your Hamilton dataflows, or use Hamilton to pull them from a DB. For debugging & monitoring, it’s important to be able to determine what prompt was used for a given invocation. You can integrate open source tools, or use something like the DAGWorks Platform to help ensure you know what was used for any invocation of your code.Prompts as code.Encoding the prompts as code allows easy versioning with git. Change management can be done via pull requests and CI/CD checks. It works well with Hamilton’s features like@config.whenand module switching at the Driver level because it determines clearly what version of the prompt is used. This approach strengthens the use of any tooling you might use to monitor or track, like the DAGWorks Platform, as prompts for a deployment are immutable.We want to hear from you!If you’re excited by any of this, or have strong opinions, leave a comment, or drop by our Slack channel! Some links to do praise/complain/chat:📣join our community on Slack—\\u200awe’re more than happy to help answer questions you might have or get you started.⭐️ us onGitHub.📝 leave us anissueif you find something.📚 read ourdocumentation.⌨️ interactivelylearn about Hamilton in your browser.Other Hamilton posts you might be interested in:We have a growing collection of posts & content. Here are some we think you might be interested in.Containerized PDF Summarizer with FastAPI and HamiltonThierry Jean,DAGWorks Inc., andStefan Krawczyk·Aug 18Skip learning convoluted LLM-specific frameworks and write your first LLM application using regular Python functions and Hamilton! In this post, we’ll present a containerized PDF summarizer powered by the OpenAI API. 
Its flow is encoded in Hamilton, which theRead full storyBuilding a maintainable and modular LLM application stack with HamiltonThierry JeanandDAGWorks Inc.·Jul 11In this post, we’re going to share how Hamilton can help you write modular and maintainable code for your large language model (LLM) application stack. Hamilton is great for describing any type of dataflow, which is exactly what you’re doing when building an LLM powered application. With Hamilton you get strongRead full storytryhamilton.dev– an interactive tutorial in your browser!Hamilton + Airflow(GitHub repo)Hamilton + Feast(GitHub repo)Pandas data transformations in Hamilton in 5 minutesLineage + Hamilton in 10 minutes4Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherShareCommentsTopNewNo postsReady for more?Subscribe© 2023 DAGWorks Inc.Privacy∙Terms∙Collection noticeStart WritingGet the appSubstackis the home for great writing DAGWorks’s SubstackSubscribeSign inShare this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherDiscover more from DAGWorks’s SubstackThought posts, and updates on Hamilton and the DAGWorks Platform.SubscribeContinue readingSign inLLMOps: Production prompt engineering patterns with HamiltonAn overview of the production grade ways to iterate on prompts with Hamilton.DAGWorks Inc.,Stefan Krawczyk, andThierry JeanSep 6, 20234Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherShareWhat you send to your LLM is quite important. Small variations and changes can have large impacts on outputs, so as your product evolves, the need to evolve your prompts will too. LLMs are also constantly being developed and released, and so as LLMs change, your prompts will also need to change. 
Therefore it’s important to set up an iteration pattern to operationalize how you “deploy” your prompts so you and your team can move efficiently, but also ensure that production issues are minimized, if not avoided. In this post, we’ll guide you through the best practices of managing prompts with Hamilton, making analogies to MLOps patterns, and discussing trade-offs along the way.Notes:(1): if you’re looking for a post that talks about “context management” this isn’t that post. But it is the post that will help you with the nuts and bolts on how to iterate and create that production grade “prompt context management” iteration story.(2): we’ll use prompt & prompt template interchangeably.(3): we’ll assume an “online” web-service setting is where these prompts are being used.(4): we’ll be using ourHamilton’s PDF summarizer exampleto project our patterns onto.(5): not familiar withHamilton? You can either learn about Hamilton viaTry Hamiltonand come back, or get the high level LLMOps approach from this post and then dig into Hamilton via thePDF Summarizer example.(6): what’s our credibility here? We’ve spent our careers building self-service data/MLOps tooling, most famously for Stitch Fix’s 100+ Data Scientists. So we’ve seen our share of outages and approaches play out over time.Thanks for reading DAGWorks’s Substack! Subscribe for free to receive updates and posts like this!SubscribePrompts are to LLMs what hyper-parameters are to ML modelsPoint:Prompts + LLM APIs are analogous to hyper-parameters + machine learning models.In terms of “Ops” practices, LLMOps is still in its infancy. MLOps is a little older, but still neither are widely adopted if you’re comparing it to how widespread knowledge is around DevOps practices.DevOps practices largely concern themselves with how you ship code to production, and MLOps practices how to ship code& data artifacts(e.g., statistical models)to production. So what about LLMOps? 
Personally, I think it’s closer to MLOps since you have:your LLM workflow is simply code.and an LLM API is a data artifact that can be “tweaked” using prompts, similar to a machine learning (ML) model and its hyper-parameters.Therefore, you most likely care about versioning the LLM API + prompts together tightly for good production practices. For instance, in MLOps practice,  you’d want a process in place to validate your ML model still behaves correctly whenever its hyper-parameters are changed.How should you think about operationalizing a prompt?To be clear, the two parts to control for are theLLMand theprompts. Much like  MLOps, when the code or the model artifact changes, you want to be able to determine which did. For LLMOps, we’ll want the same discernment, separating the LLM workflow from the LLM API + prompts. Importantly, we should consider LLMs (self-hosted or APIs) to be mostly static since we less frequently update (or even control) their internals. So, changing thepromptspart of LLM API + prompts is effectively like creating a new model artifact.There are two main ways to treat prompts:Prompts as dynamic runtime variables. The template used isn’t static to a deployment.Prompts as code.The prompt template is static/ predetermined given a deployment.The main difference is the amount of moving parts you need to manage to ensure a great production story. Below, we dig into how to use Hamilton in the context of these two approaches.Prompts as dynamic runtime variablesDynamically Pass/Load PromptsPrompts are just strings. 
Since strings are a primitive type in most languages, this means that they are quite easy to pass around.\\xa0 The idea is to abstract your code so that at runtime you pass in the prompts required.\\xa0 More concretely, you’d “load/reload” prompt templates whenever there’s an “updated” one.The MLOps analogy here, would be to auto-reload the ML model artifact (e.g., a pkl file) whenever a new model is available.MLOps Analogy: diagram showing how ML model auto reloading would look.Diagram showing what dynamically reloading/querying prompts would look like.The benefit here is that you can very quickly roll out new prompts because you do not need to redeploy your application!The downside to this iteration speed is increased operational burden:To someone monitoring your application, it’ll be unclear when the change occurred and whether it’s propagated itself through your systems. For example, you just pushed a new prompt, and the LLM now returns more tokens per request, causing latency to spike; whoever is monitoring will likely be puzzled, unless you have a great change log culture.Rollback semantics involve having to know aboutanothersystem. You can’t just rollback a prior deployment to fix things.You’ll need great monitoring to understand what was run and when; e.g., when customer service gives you a ticket to investigate, how do you know what prompt was in use?You’ll need to manage and monitor whatever system you’re using to manage and store your prompts. This will be an extra system you’ll need to maintain outside of whatever is serving your code.You’ll need to manage two processes, one for updating and pushing the service, and one for updating and pushing prompts. Synchronizing these changes will be on you. For example, you need to make a code change to your service to handle a new prompt. 
You will need to coordinate changing two systems to make it work, which is extra operational overhead to manage.How it would work with HamiltonOur PDF summarizer flow would look something like this if you removesummarize_text_from_summaries_promptandsummarize_chunk_of_text_promptfunction definitions:summarization_shortened.py. Note the two inputs “*_prompt” that denote prompts that are now required as input to the dataflow to function. With Hamilton you’ll be able to determine what inputs should be required for your prompt template by just looking at a diagram like this. Diagram created via Hamilton.To operate things, you’ll want to either inject the prompts at request time:from hamilton import base, driver\\nimport summarization_shortend\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(summarization_sortened)\\n    .build()\\n)\\n\\n# pull prompts from somewhere\\nsummarize_chunk_of_text_prompt = \"\"\"SOME PROMPT FOR {chunked_text}\"\"\"\\nsummarize_text_from_summaries_prompt = \"\"\"SOME PROMPT {summarized_chunks} ... {user_query}\"\"\"\\n\\n# execute, and pass in the prompt \\nresult = dr.execute(\\n   [\"summarized_text\"],\\n   inputs={\\n      \"summarize_chunk_of_text_prompt\": summarize_chunk_of_text_prompt,\\n      ...\\n   }\\n)Or\\xa0you change your code to dynamically load prompts, i.e., add functions to retrieve prompts from an external system as part of the Hamilton dataflow. 
At each invocation, they will query for the prompt to use (you can of course cache this for performance):# prompt_template_loaders.py\\n\\ndef summarize_chunk_of_text_prompt(\\n  db_client: Client, other_args: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query( \\n         \"get latest prompt X from DB\", other_args)\\n    return _prompt\\n\\ndef summarize_text_from_summaries_prompt(\\n   db_client: Client, another_arg: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query(\\n         \"get latest prompt Y from DB\", another_arg)\\n    return _promptDriver code:from hamilton import base, driver\\nimport prompt_template_loaders # <-- load this to provide prompt input\\nimport summarization_shortend\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(\\n        prompt_template_loaders,# <-- Hamilton will call above functions\\n        summarization_sortened, \\n    )\\n    .build()\\n)\\n\\n# execute, and pass in the prompt \\nresult = dr.execute(\\n   [\"summarized_text\"],\\n   inputs={\\n      # don\\'t need to pass prompts in this version\\n   }\\n)How would I log prompts used and monitor flows?Here we outline a few ways to monitor what went on.Log results of execution. That is run Hamilton, then emit information to wherever you want it to go.result = dr.execute(\\n   [\"summarized_text\", \\n    \"summarize_chunk_of_text_prompt\",   \\n    ... # and anything else you want to pull out\\n    \"summarize_text_from_summaries_prompt\"],\\n   inputs={\\n      # don\\'t need to pass prompts in this version\\n   }\\n)\\n\\nmy_log_system(result) # send what you want for safe keeping to some\\n                      # system that you own.Note. In the above, Hamilton allows you to requestanyintermediateoutputs simply by requesting “functions” (i.e. nodes in the diagram) by name. 
If we really want to get all the intermediate outputs of the entire dataflow, we can do so and log it wherever we want to!Use loggers inside Hamilton functions (to see the power of this approach,see my old talk on structured logs):import logging\\n\\nlogger = logging.getLogger(__name__)\\n\\ndef summarize_text_from_summaries_prompt(\\n    db_client: Client, another_arg: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query(\\n         \"get latest prompt Y from DB\", another_arg)\\n    logger.info(f\"Prompt used is [{_prompt}]\")\\n    return _promptExtend Hamilton to emit this information. You use Hamilton to capture information from executed functions, i.e. nodes, without needing to insert logging statement inside the function’s body. This promotes reusability since you can toggle logging between development and production settings at the Driver level. SeeGraphAdapters, or write your ownPython decoratorto wrap functions for monitoring.In any of the above code, you could easily pull in a 3rd party tool to help track & monitor the code, as well as the external API call, e.g. data dog. Note, with a one-line code change, you can plug in the DAGWorks’s Driver and get all that monitoring you’d want and more. (Try the free tierhere)!Prompts as codePrompts as static stringsSince prompts are simply strings, they’re also very amenable to being stored along with your source code. The idea is to store as many prompt versions as you like, within your code so that at runtime, the set of prompts available is fixed and deterministic.The MLOps analogy here is, instead of dynamically reloading models, you instead bake the ML model into the container/hard code the reference. Once deployed, your app has everything that it needs. The deployment is immutable; nothing changes once it’s up. 
This makes debugging & determining what’s going on, much simpler.MLOps Analogy: make an immutable deployment by making the model fixed for your app’s deployment.Diagram showing how treating prompts as code enables you to leverage your CI/CD and build an immutable deployment for talking to your LLM API.This approach has many operational benefits:Whenever a new prompt is pushed, it forces a new deployment. Rollback semantics are clear if there’s an issue with a new prompt.You can submit a pull request (PR) for the source code and prompts at the same time. It becomes simpler to review what the change is, and the downstream dependencies of what these prompts will touch/interact with.You can add checks to your CI/CD system to ensure bad prompts don’t make it to production.It’s simpler to debug an issue. You just pull the (Docker) container that was created and you’ll be able to exactly replicate any customer issue quickly and easily.There is no other “prompt system” to maintain or manage. Simplifying operations.It doesn’t preclude adding extra monitoring and visibility.How it would work with HamiltonThe prompts would be encoded into functions into the dataflow/directed acyclic graph (DAG):What summarization.py in the PDF summarizer example looks like. The prompt templates are part of the code. Diagram created via Hamilton.Pairing this code withgit, we have a lightweight versioning system for your entire dataflow (i.e. “chain”), so you can always discern what state the world was in, given a git commit SHA. If you want to manage and have access to multiple prompts at any given point in time, Hamilton has two powerful abstractions to enable you to do so:@config.whenandPython modules. This allows you to store and keep available all older prompt versions together and specify which one to use via code.@config.when (docs)Hamilton has a concept of decorators, which are just annotations on functions. 
The@config.whendecorator allows to specify alternative implementations for a functions, i.e. “node”, in your dataflow. In this case, we specify alternative prompts.from hamilton.function_modifiers import config\\n\\n@config.when(version=\"v1\")\\ndef summarize_chunk_of_text_prompt__v1() -> str:\\n    \"\"\"V1 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text. Extract any key points with reasoning.\\\\n\\\\nContent:\"\\n\\n@config.when(version=\"v2\")\\ndef summarize_chunk_of_text_prompt__v2(content_type: str = \"an academic paper\") -> str:\\n    \"\"\"V2 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text from {content_type}. Extract the key points with reasoning. \\\\n\\\\nContent:\"You can keep adding functions annotated with@config.when, allowing you to swap between them using configuration passed to the HamiltonDriver. When instantiating theDriver,  it will construct the dataflow using the prompt implementation associated with the configuration value.from hamilton import base, driver\\nimport summarization\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(summarization)\\n    .with_config({\"version\": \"v1\"}) # V1 is chosen. Use \"v2\\' for V2.\\n    .build()\\n)Module switchingAlternatively to using@config.when, you can instead place your different prompt implementations into different Python modules. Then, atDriverconstruction time, pass the correct module for the context you want to use.So here we have one module housing V1 of our prompt:# prompts_v1.py\\ndef summarize_chunk_of_text_prompt() -> str:\\n    \"\"\"V1 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text. 
Extract any key points with reasoning.\\\\n\\\\nContent:\"Here we have one module housing V2 (see how they differ slightly):# prompts_v2.py\\ndef summarize_chunk_of_text_prompt(content_type: str = \"an academic paper\") -> str:\\n    \"\"\"V2 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text from {content_type}. Extract the key points with reasoning. \\\\n\\\\nContent:\"In the driver code below, we choose the right module to use based on some context.# run.py\\nfrom hamilton import driver\\nimport summarization\\nimport prompts_v1\\nimport prompts_v2\\n\\n# create driver -- passing in the right module we want\\ndr = (\\n    driver.Builder()\\n    .with_modules(\\n        prompts_v1,  # or prompts_v2\\n        summarization,\\n    )\\n    .build()\\n)Using the module approach allows us to encapsulate and version whole sets of prompts together. If you want to go back in time (via git), or see what a blessed prompt version was, you just need to navigate to the correct commit, and then look in the right module.How would I log prompts used and monitor flows?Assuming you’re using git to track your code, you wouldn’t need to record what prompts were being used. Instead, you’d just need to know what git commit SHA is deployed and you’ll be able to track the version of your code and prompts simultaneously.To monitor flows, just like the above approach, you have the same monitoring hooks available at your disposal, and I wont repeat them here, but they are:Request any intermediate outputs and log them yourself outside of Hamilton.Log them from within the function yourself, or build aPython decorator/GraphAdapterto do it at the framework level.Integrate 3rd party tooling for monitoring your code and LLM API calls, or use the DAGWorks Platform offering to monitor it all. (Try the free tierhere)!or all the above!What about A/B testing my prompts?With any ML initiative, it’s important to measure business impacts of changes. 
Likewise, with LLMs + prompts, it’ll be important to test and measure changes against important business metrics. In the MLOps world, you’d be A/B testing ML models to evaluate their business value by dividing traffic between them. To ensure the randomness necessary for A/B tests, you wouldn’t know at runtime which model to use until a coin is flipped. However, to get those models out, they both would have to follow a process to qualify them. So for prompts, we should think similarly.The above two prompt engineering patterns don’t preclude you from being able to A/B test prompts, but it means you need to manage a process to enable however many prompt templates you’re testing in parallel. If you’re also adjusting code paths, having them in code will be simpler to discern and debug what is going on, and you can make use of the `@config.when` decorator / python module swapping for this purpose. Versus, having to critically rely on your logging/monitoring/observability stack to tell you what prompt was used if you’re dynamically loading/passing them in and then having to mentally map which prompts go with which code paths.Note, this all gets harder if you start needing to change multiple prompts for an A/B test because you have several of them in a flow. For example, if you have two prompts in your workflow and you’re changing LLMs, you’ll want to A/B test the change holistically, rather than individually per prompt. Our advice, by putting the prompts into code your operational life will be simpler, since you’ll know what two prompts belong to what code paths without having to do any mental mapping.Thank you for reading DAGWorks’s Substack. This post is public so feel free to share it.ShareSummaryIn this post, we explained two patterns for managing prompts in a production environment with Hamilton. The first approach treatsprompts asdynamic runtime variables,while the second, treatsprompts as codefor production settings. 
If you value reducing operational burden, then our advice is to encode prompts as code, as it is operationally simpler, unless the speed to change them really matters for you.To recap:Prompts as dynamic runtime variables. Use an external system to pass the prompts to your Hamilton dataflows, or use Hamilton to pull them from a DB. For debugging & monitoring, it’s important to be able to determine what prompt was used for a given invocation. You can integrate open source tools, or use something like the DAGWorks Platform to help ensure you know what was used for any invocation of your code.Prompts as code.Encoding the prompts as code allows easy versioning with git. Change management can be done via pull requests and CI/CD checks. It works well with Hamilton’s features like@config.whenand module switching at the Driver level because it determines clearly what version of the prompt is used. This approach strengthens the use of any tooling you might use to monitor or track, like the DAGWorks Platform, as prompts for a deployment are immutable.We want to hear from you!If you’re excited by any of this, or have strong opinions, leave a comment, or drop by our Slack channel! Some links to do praise/complain/chat:📣join our community on Slack—\\u200awe’re more than happy to help answer questions you might have or get you started.⭐️ us onGitHub.📝 leave us anissueif you find something.📚 read ourdocumentation.⌨️ interactivelylearn about Hamilton in your browser.Other Hamilton posts you might be interested in:We have a growing collection of posts & content. Here are some we think you might be interested in.Containerized PDF Summarizer with FastAPI and HamiltonThierry Jean,DAGWorks Inc., andStefan Krawczyk·Aug 18Skip learning convoluted LLM-specific frameworks and write your first LLM application using regular Python functions and Hamilton! In this post, we’ll present a containerized PDF summarizer powered by the OpenAI API. 
Its flow is encoded in Hamilton, which theRead full storyBuilding a maintainable and modular LLM application stack with HamiltonThierry JeanandDAGWorks Inc.·Jul 11In this post, we’re going to share how Hamilton can help you write modular and maintainable code for your large language model (LLM) application stack. Hamilton is great for describing any type of dataflow, which is exactly what you’re doing when building an LLM powered application. With Hamilton you get strongRead full storytryhamilton.dev– an interactive tutorial in your browser!Hamilton + Airflow(GitHub repo)Hamilton + Feast(GitHub repo)Pandas data transformations in Hamilton in 5 minutesLineage + Hamilton in 10 minutes4Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherShareCommentsTopNewNo postsReady for more?Subscribe© 2023 DAGWorks Inc.Privacy∙Terms∙Collection noticeStart WritingGet the appSubstackis the home for great writing DAGWorks’s SubstackSubscribeSign in DAGWorks’s SubstackSubscribeSign in DAGWorks’s SubstackSubscribeSign in DAGWorks’s SubstackSubscribeSign in  SubscribeSign in SubscribeSign in  Subscribe  Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherDiscover more from DAGWorks’s SubstackThought posts, and updates on Hamilton and the DAGWorks Platform.SubscribeContinue readingSign inLLMOps: Production prompt engineering patterns with HamiltonAn overview of the production grade ways to iterate on prompts with Hamilton.DAGWorks Inc.,Stefan Krawczyk, andThierry JeanSep 6, 20234Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherShareWhat you send to your LLM is quite important. Small variations and changes can have large impacts on outputs, so as your product evolves, the need to evolve your prompts will too. 
LLMs are also constantly being developed and released, and so as LLMs change, your prompts will also need to change. Therefore it’s important to set up an iteration pattern to operationalize how you “deploy” your prompts so you and your team can move efficiently, but also ensure that production issues are minimized, if not avoided. In this post, we’ll guide you through the best practices of managing prompts with Hamilton, making analogies to MLOps patterns, and discussing trade-offs along the way.Notes:(1): if you’re looking for a post that talks about “context management” this isn’t that post. But it is the post that will help you with the nuts and bolts on how to iterate and create that production grade “prompt context management” iteration story.(2): we’ll use prompt & prompt template interchangeably.(3): we’ll assume an “online” web-service setting is where these prompts are being used.(4): we’ll be using ourHamilton’s PDF summarizer exampleto project our patterns onto.(5): not familiar withHamilton? You can either learn about Hamilton viaTry Hamiltonand come back, or get the high level LLMOps approach from this post and then dig into Hamilton via thePDF Summarizer example.(6): what’s our credibility here? We’ve spent our careers building self-service data/MLOps tooling, most famously for Stitch Fix’s 100+ Data Scientists. So we’ve seen our share of outages and approaches play out over time.Thanks for reading DAGWorks’s Substack! Subscribe for free to receive updates and posts like this!SubscribePrompts are to LLMs what hyper-parameters are to ML modelsPoint:Prompts + LLM APIs are analogous to hyper-parameters + machine learning models.In terms of “Ops” practices, LLMOps is still in its infancy. 
MLOps is a little older, but still neither are widely adopted if you’re comparing it to how widespread knowledge is around DevOps practices.DevOps practices largely concern themselves with how you ship code to production, and MLOps practices how to ship code& data artifacts(e.g., statistical models)to production. So what about LLMOps? Personally, I think it’s closer to MLOps since you have:your LLM workflow is simply code.and an LLM API is a data artifact that can be “tweaked” using prompts, similar to a machine learning (ML) model and its hyper-parameters.Therefore, you most likely care about versioning the LLM API + prompts together tightly for good production practices. For instance, in MLOps practice,  you’d want a process in place to validate your ML model still behaves correctly whenever its hyper-parameters are changed.How should you think about operationalizing a prompt?To be clear, the two parts to control for are theLLMand theprompts. Much like  MLOps, when the code or the model artifact changes, you want to be able to determine which did. For LLMOps, we’ll want the same discernment, separating the LLM workflow from the LLM API + prompts. Importantly, we should consider LLMs (self-hosted or APIs) to be mostly static since we less frequently update (or even control) their internals. So, changing thepromptspart of LLM API + prompts is effectively like creating a new model artifact.There are two main ways to treat prompts:Prompts as dynamic runtime variables. The template used isn’t static to a deployment.Prompts as code.The prompt template is static/ predetermined given a deployment.The main difference is the amount of moving parts you need to manage to ensure a great production story. Below, we dig into how to use Hamilton in the context of these two approaches.Prompts as dynamic runtime variablesDynamically Pass/Load PromptsPrompts are just strings. 
Since strings are a primitive type in most languages, this means that they are quite easy to pass around.\\xa0 The idea is to abstract your code so that at runtime you pass in the prompts required.\\xa0 More concretely, you’d “load/reload” prompt templates whenever there’s an “updated” one.The MLOps analogy here, would be to auto-reload the ML model artifact (e.g., a pkl file) whenever a new model is available.MLOps Analogy: diagram showing how ML model auto reloading would look.Diagram showing what dynamically reloading/querying prompts would look like.The benefit here is that you can very quickly roll out new prompts because you do not need to redeploy your application!The downside to this iteration speed is increased operational burden:To someone monitoring your application, it’ll be unclear when the change occurred and whether it’s propagated itself through your systems. For example, you just pushed a new prompt, and the LLM now returns more tokens per request, causing latency to spike; whoever is monitoring will likely be puzzled, unless you have a great change log culture.Rollback semantics involve having to know aboutanothersystem. You can’t just rollback a prior deployment to fix things.You’ll need great monitoring to understand what was run and when; e.g., when customer service gives you a ticket to investigate, how do you know what prompt was in use?You’ll need to manage and monitor whatever system you’re using to manage and store your prompts. This will be an extra system you’ll need to maintain outside of whatever is serving your code.You’ll need to manage two processes, one for updating and pushing the service, and one for updating and pushing prompts. Synchronizing these changes will be on you. For example, you need to make a code change to your service to handle a new prompt. 
You will need to coordinate changing two systems to make it work, which is extra operational overhead to manage.How it would work with HamiltonOur PDF summarizer flow would look something like this if you removesummarize_text_from_summaries_promptandsummarize_chunk_of_text_promptfunction definitions:summarization_shortened.py. Note the two inputs “*_prompt” that denote prompts that are now required as input to the dataflow to function. With Hamilton you’ll be able to determine what inputs should be required for your prompt template by just looking at a diagram like this. Diagram created via Hamilton.To operate things, you’ll want to either inject the prompts at request time:from hamilton import base, driver\\nimport summarization_shortend\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(summarization_sortened)\\n    .build()\\n)\\n\\n# pull prompts from somewhere\\nsummarize_chunk_of_text_prompt = \"\"\"SOME PROMPT FOR {chunked_text}\"\"\"\\nsummarize_text_from_summaries_prompt = \"\"\"SOME PROMPT {summarized_chunks} ... {user_query}\"\"\"\\n\\n# execute, and pass in the prompt \\nresult = dr.execute(\\n   [\"summarized_text\"],\\n   inputs={\\n      \"summarize_chunk_of_text_prompt\": summarize_chunk_of_text_prompt,\\n      ...\\n   }\\n)Or\\xa0you change your code to dynamically load prompts, i.e., add functions to retrieve prompts from an external system as part of the Hamilton dataflow. 
At each invocation, they will query for the prompt to use (you can of course cache this for performance):# prompt_template_loaders.py\\n\\ndef summarize_chunk_of_text_prompt(\\n  db_client: Client, other_args: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query( \\n         \"get latest prompt X from DB\", other_args)\\n    return _prompt\\n\\ndef summarize_text_from_summaries_prompt(\\n   db_client: Client, another_arg: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query(\\n         \"get latest prompt Y from DB\", another_arg)\\n    return _promptDriver code:from hamilton import base, driver\\nimport prompt_template_loaders # <-- load this to provide prompt input\\nimport summarization_shortend\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(\\n        prompt_template_loaders,# <-- Hamilton will call above functions\\n        summarization_sortened, \\n    )\\n    .build()\\n)\\n\\n# execute, and pass in the prompt \\nresult = dr.execute(\\n   [\"summarized_text\"],\\n   inputs={\\n      # don\\'t need to pass prompts in this version\\n   }\\n)How would I log prompts used and monitor flows?Here we outline a few ways to monitor what went on.Log results of execution. That is run Hamilton, then emit information to wherever you want it to go.result = dr.execute(\\n   [\"summarized_text\", \\n    \"summarize_chunk_of_text_prompt\",   \\n    ... # and anything else you want to pull out\\n    \"summarize_text_from_summaries_prompt\"],\\n   inputs={\\n      # don\\'t need to pass prompts in this version\\n   }\\n)\\n\\nmy_log_system(result) # send what you want for safe keeping to some\\n                      # system that you own.Note. In the above, Hamilton allows you to requestanyintermediateoutputs simply by requesting “functions” (i.e. nodes in the diagram) by name. 
If we really want to get all the intermediate outputs of the entire dataflow, we can do so and log it wherever we want to!Use loggers inside Hamilton functions (to see the power of this approach,see my old talk on structured logs):import logging\\n\\nlogger = logging.getLogger(__name__)\\n\\ndef summarize_text_from_summaries_prompt(\\n    db_client: Client, another_arg: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query(\\n         \"get latest prompt Y from DB\", another_arg)\\n    logger.info(f\"Prompt used is [{_prompt}]\")\\n    return _promptExtend Hamilton to emit this information. You use Hamilton to capture information from executed functions, i.e. nodes, without needing to insert logging statement inside the function’s body. This promotes reusability since you can toggle logging between development and production settings at the Driver level. SeeGraphAdapters, or write your ownPython decoratorto wrap functions for monitoring.In any of the above code, you could easily pull in a 3rd party tool to help track & monitor the code, as well as the external API call, e.g. data dog. Note, with a one-line code change, you can plug in the DAGWorks’s Driver and get all that monitoring you’d want and more. (Try the free tierhere)!Prompts as codePrompts as static stringsSince prompts are simply strings, they’re also very amenable to being stored along with your source code. The idea is to store as many prompt versions as you like, within your code so that at runtime, the set of prompts available is fixed and deterministic.The MLOps analogy here is, instead of dynamically reloading models, you instead bake the ML model into the container/hard code the reference. Once deployed, your app has everything that it needs. The deployment is immutable; nothing changes once it’s up. 
This makes debugging & determining what’s going on, much simpler.MLOps Analogy: make an immutable deployment by making the model fixed for your app’s deployment.Diagram showing how treating prompts as code enables you to leverage your CI/CD and build an immutable deployment for talking to your LLM API.This approach has many operational benefits:Whenever a new prompt is pushed, it forces a new deployment. Rollback semantics are clear if there’s an issue with a new prompt.You can submit a pull request (PR) for the source code and prompts at the same time. It becomes simpler to review what the change is, and the downstream dependencies of what these prompts will touch/interact with.You can add checks to your CI/CD system to ensure bad prompts don’t make it to production.It’s simpler to debug an issue. You just pull the (Docker) container that was created and you’ll be able to exactly replicate any customer issue quickly and easily.There is no other “prompt system” to maintain or manage. Simplifying operations.It doesn’t preclude adding extra monitoring and visibility.How it would work with HamiltonThe prompts would be encoded into functions into the dataflow/directed acyclic graph (DAG):What summarization.py in the PDF summarizer example looks like. The prompt templates are part of the code. Diagram created via Hamilton.Pairing this code withgit, we have a lightweight versioning system for your entire dataflow (i.e. “chain”), so you can always discern what state the world was in, given a git commit SHA. If you want to manage and have access to multiple prompts at any given point in time, Hamilton has two powerful abstractions to enable you to do so:@config.whenandPython modules. This allows you to store and keep available all older prompt versions together and specify which one to use via code.@config.when (docs)Hamilton has a concept of decorators, which are just annotations on functions. 
The@config.whendecorator allows to specify alternative implementations for a functions, i.e. “node”, in your dataflow. In this case, we specify alternative prompts.from hamilton.function_modifiers import config\\n\\n@config.when(version=\"v1\")\\ndef summarize_chunk_of_text_prompt__v1() -> str:\\n    \"\"\"V1 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text. Extract any key points with reasoning.\\\\n\\\\nContent:\"\\n\\n@config.when(version=\"v2\")\\ndef summarize_chunk_of_text_prompt__v2(content_type: str = \"an academic paper\") -> str:\\n    \"\"\"V2 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text from {content_type}. Extract the key points with reasoning. \\\\n\\\\nContent:\"You can keep adding functions annotated with@config.when, allowing you to swap between them using configuration passed to the HamiltonDriver. When instantiating theDriver,  it will construct the dataflow using the prompt implementation associated with the configuration value.from hamilton import base, driver\\nimport summarization\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(summarization)\\n    .with_config({\"version\": \"v1\"}) # V1 is chosen. Use \"v2\\' for V2.\\n    .build()\\n)Module switchingAlternatively to using@config.when, you can instead place your different prompt implementations into different Python modules. Then, atDriverconstruction time, pass the correct module for the context you want to use.So here we have one module housing V1 of our prompt:# prompts_v1.py\\ndef summarize_chunk_of_text_prompt() -> str:\\n    \"\"\"V1 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text. 
Extract any key points with reasoning.\\\\n\\\\nContent:\"Here we have one module housing V2 (see how they differ slightly):# prompts_v2.py\\ndef summarize_chunk_of_text_prompt(content_type: str = \"an academic paper\") -> str:\\n    \"\"\"V2 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text from {content_type}. Extract the key points with reasoning. \\\\n\\\\nContent:\"In the driver code below, we choose the right module to use based on some context.# run.py\\nfrom hamilton import driver\\nimport summarization\\nimport prompts_v1\\nimport prompts_v2\\n\\n# create driver -- passing in the right module we want\\ndr = (\\n    driver.Builder()\\n    .with_modules(\\n        prompts_v1,  # or prompts_v2\\n        summarization,\\n    )\\n    .build()\\n)Using the module approach allows us to encapsulate and version whole sets of prompts together. If you want to go back in time (via git), or see what a blessed prompt version was, you just need to navigate to the correct commit, and then look in the right module.How would I log prompts used and monitor flows?Assuming you’re using git to track your code, you wouldn’t need to record what prompts were being used. Instead, you’d just need to know what git commit SHA is deployed and you’ll be able to track the version of your code and prompts simultaneously.To monitor flows, just like the above approach, you have the same monitoring hooks available at your disposal, and I wont repeat them here, but they are:Request any intermediate outputs and log them yourself outside of Hamilton.Log them from within the function yourself, or build aPython decorator/GraphAdapterto do it at the framework level.Integrate 3rd party tooling for monitoring your code and LLM API calls, or use the DAGWorks Platform offering to monitor it all. (Try the free tierhere)!or all the above!What about A/B testing my prompts?With any ML initiative, it’s important to measure business impacts of changes. 
Likewise, with LLMs + prompts, it’ll be important to test and measure changes against important business metrics. In the MLOps world, you’d be A/B testing ML models to evaluate their business value by dividing traffic between them. To ensure the randomness necessary for A/B tests, you wouldn’t know at runtime which model to use until a coin is flipped. However, to get those models out, they both would have to follow a process to qualify them. So for prompts, we should think similarly.The above two prompt engineering patterns don’t preclude you from being able to A/B test prompts, but it means you need to manage a process to enable however many prompt templates you’re testing in parallel. If you’re also adjusting code paths, having them in code will be simpler to discern and debug what is going on, and you can make use of the `@config.when` decorator / python module swapping for this purpose. Versus, having to critically rely on your logging/monitoring/observability stack to tell you what prompt was used if you’re dynamically loading/passing them in and then having to mentally map which prompts go with which code paths.Note, this all gets harder if you start needing to change multiple prompts for an A/B test because you have several of them in a flow. For example, if you have two prompts in your workflow and you’re changing LLMs, you’ll want to A/B test the change holistically, rather than individually per prompt. Our advice, by putting the prompts into code your operational life will be simpler, since you’ll know what two prompts belong to what code paths without having to do any mental mapping.Thank you for reading DAGWorks’s Substack. This post is public so feel free to share it.ShareSummaryIn this post, we explained two patterns for managing prompts in a production environment with Hamilton. The first approach treatsprompts asdynamic runtime variables,while the second, treatsprompts as codefor production settings. 
If you value reducing operational burden, then our advice is to encode prompts as code, as it is operationally simpler, unless the speed to change them really matters for you.To recap:Prompts as dynamic runtime variables. Use an external system to pass the prompts to your Hamilton dataflows, or use Hamilton to pull them from a DB. For debugging & monitoring, it’s important to be able to determine what prompt was used for a given invocation. You can integrate open source tools, or use something like the DAGWorks Platform to help ensure you know what was used for any invocation of your code.Prompts as code.Encoding the prompts as code allows easy versioning with git. Change management can be done via pull requests and CI/CD checks. It works well with Hamilton’s features like@config.whenand module switching at the Driver level because it determines clearly what version of the prompt is used. This approach strengthens the use of any tooling you might use to monitor or track, like the DAGWorks Platform, as prompts for a deployment are immutable.We want to hear from you!If you’re excited by any of this, or have strong opinions, leave a comment, or drop by our Slack channel! Some links to do praise/complain/chat:📣join our community on Slack—\\u200awe’re more than happy to help answer questions you might have or get you started.⭐️ us onGitHub.📝 leave us anissueif you find something.📚 read ourdocumentation.⌨️ interactivelylearn about Hamilton in your browser.Other Hamilton posts you might be interested in:We have a growing collection of posts & content. Here are some we think you might be interested in.Containerized PDF Summarizer with FastAPI and HamiltonThierry Jean,DAGWorks Inc., andStefan Krawczyk·Aug 18Skip learning convoluted LLM-specific frameworks and write your first LLM application using regular Python functions and Hamilton! In this post, we’ll present a containerized PDF summarizer powered by the OpenAI API. 
Its flow is encoded in Hamilton, which theRead full storyBuilding a maintainable and modular LLM application stack with HamiltonThierry JeanandDAGWorks Inc.·Jul 11In this post, we’re going to share how Hamilton can help you write modular and maintainable code for your large language model (LLM) application stack. Hamilton is great for describing any type of dataflow, which is exactly what you’re doing when building an LLM powered application. With Hamilton you get strongRead full storytryhamilton.dev– an interactive tutorial in your browser!Hamilton + Airflow(GitHub repo)Hamilton + Feast(GitHub repo)Pandas data transformations in Hamilton in 5 minutesLineage + Hamilton in 10 minutes4Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherShareCommentsTopNewNo postsReady for more?Subscribe Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherDiscover more from DAGWorks’s SubstackThought posts, and updates on Hamilton and the DAGWorks Platform.SubscribeContinue readingSign inLLMOps: Production prompt engineering patterns with HamiltonAn overview of the production grade ways to iterate on prompts with Hamilton.DAGWorks Inc.,Stefan Krawczyk, andThierry JeanSep 6, 20234Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherShareWhat you send to your LLM is quite important. Small variations and changes can have large impacts on outputs, so as your product evolves, the need to evolve your prompts will too. LLMs are also constantly being developed and released, and so as LLMs change, your prompts will also need to change. Therefore it’s important to set up an iteration pattern to operationalize how you “deploy” your prompts so you and your team can move efficiently, but also ensure that production issues are minimized, if not avoided. 
In this post, we’ll guide you through the best practices of managing prompts with Hamilton, making analogies to MLOps patterns, and discussing trade-offs along the way.Notes:(1): if you’re looking for a post that talks about “context management” this isn’t that post. But it is the post that will help you with the nuts and bolts on how to iterate and create that production grade “prompt context management” iteration story.(2): we’ll use prompt & prompt template interchangeably.(3): we’ll assume an “online” web-service setting is where these prompts are being used.(4): we’ll be using ourHamilton’s PDF summarizer exampleto project our patterns onto.(5): not familiar withHamilton? You can either learn about Hamilton viaTry Hamiltonand come back, or get the high level LLMOps approach from this post and then dig into Hamilton via thePDF Summarizer example.(6): what’s our credibility here? We’ve spent our careers building self-service data/MLOps tooling, most famously for Stitch Fix’s 100+ Data Scientists. So we’ve seen our share of outages and approaches play out over time.Thanks for reading DAGWorks’s Substack! Subscribe for free to receive updates and posts like this!SubscribePrompts are to LLMs what hyper-parameters are to ML modelsPoint:Prompts + LLM APIs are analogous to hyper-parameters + machine learning models.In terms of “Ops” practices, LLMOps is still in its infancy. MLOps is a little older, but still neither are widely adopted if you’re comparing it to how widespread knowledge is around DevOps practices.DevOps practices largely concern themselves with how you ship code to production, and MLOps practices how to ship code& data artifacts(e.g., statistical models)to production. So what about LLMOps? 
Personally, I think it’s closer to MLOps since you have:your LLM workflow is simply code.and an LLM API is a data artifact that can be “tweaked” using prompts, similar to a machine learning (ML) model and its hyper-parameters.Therefore, you most likely care about versioning the LLM API + prompts together tightly for good production practices. For instance, in MLOps practice,  you’d want a process in place to validate your ML model still behaves correctly whenever its hyper-parameters are changed.How should you think about operationalizing a prompt?To be clear, the two parts to control for are theLLMand theprompts. Much like  MLOps, when the code or the model artifact changes, you want to be able to determine which did. For LLMOps, we’ll want the same discernment, separating the LLM workflow from the LLM API + prompts. Importantly, we should consider LLMs (self-hosted or APIs) to be mostly static since we less frequently update (or even control) their internals. So, changing thepromptspart of LLM API + prompts is effectively like creating a new model artifact.There are two main ways to treat prompts:Prompts as dynamic runtime variables. The template used isn’t static to a deployment.Prompts as code.The prompt template is static/ predetermined given a deployment.The main difference is the amount of moving parts you need to manage to ensure a great production story. Below, we dig into how to use Hamilton in the context of these two approaches.Prompts as dynamic runtime variablesDynamically Pass/Load PromptsPrompts are just strings. 
Since strings are a primitive type in most languages, this means that they are quite easy to pass around.\\xa0 The idea is to abstract your code so that at runtime you pass in the prompts required.\\xa0 More concretely, you’d “load/reload” prompt templates whenever there’s an “updated” one.The MLOps analogy here, would be to auto-reload the ML model artifact (e.g., a pkl file) whenever a new model is available.MLOps Analogy: diagram showing how ML model auto reloading would look.Diagram showing what dynamically reloading/querying prompts would look like.The benefit here is that you can very quickly roll out new prompts because you do not need to redeploy your application!The downside to this iteration speed is increased operational burden:To someone monitoring your application, it’ll be unclear when the change occurred and whether it’s propagated itself through your systems. For example, you just pushed a new prompt, and the LLM now returns more tokens per request, causing latency to spike; whoever is monitoring will likely be puzzled, unless you have a great change log culture.Rollback semantics involve having to know aboutanothersystem. You can’t just rollback a prior deployment to fix things.You’ll need great monitoring to understand what was run and when; e.g., when customer service gives you a ticket to investigate, how do you know what prompt was in use?You’ll need to manage and monitor whatever system you’re using to manage and store your prompts. This will be an extra system you’ll need to maintain outside of whatever is serving your code.You’ll need to manage two processes, one for updating and pushing the service, and one for updating and pushing prompts. Synchronizing these changes will be on you. For example, you need to make a code change to your service to handle a new prompt. 
You will need to coordinate changing two systems to make it work, which is extra operational overhead to manage.How it would work with HamiltonOur PDF summarizer flow would look something like this if you removesummarize_text_from_summaries_promptandsummarize_chunk_of_text_promptfunction definitions:summarization_shortened.py. Note the two inputs “*_prompt” that denote prompts that are now required as input to the dataflow to function. With Hamilton you’ll be able to determine what inputs should be required for your prompt template by just looking at a diagram like this. Diagram created via Hamilton.To operate things, you’ll want to either inject the prompts at request time:from hamilton import base, driver\\nimport summarization_shortend\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(summarization_sortened)\\n    .build()\\n)\\n\\n# pull prompts from somewhere\\nsummarize_chunk_of_text_prompt = \"\"\"SOME PROMPT FOR {chunked_text}\"\"\"\\nsummarize_text_from_summaries_prompt = \"\"\"SOME PROMPT {summarized_chunks} ... {user_query}\"\"\"\\n\\n# execute, and pass in the prompt \\nresult = dr.execute(\\n   [\"summarized_text\"],\\n   inputs={\\n      \"summarize_chunk_of_text_prompt\": summarize_chunk_of_text_prompt,\\n      ...\\n   }\\n)Or\\xa0you change your code to dynamically load prompts, i.e., add functions to retrieve prompts from an external system as part of the Hamilton dataflow. 
At each invocation, they will query for the prompt to use (you can of course cache this for performance):# prompt_template_loaders.py\\n\\ndef summarize_chunk_of_text_prompt(\\n  db_client: Client, other_args: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query( \\n         \"get latest prompt X from DB\", other_args)\\n    return _prompt\\n\\ndef summarize_text_from_summaries_prompt(\\n   db_client: Client, another_arg: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query(\\n         \"get latest prompt Y from DB\", another_arg)\\n    return _promptDriver code:from hamilton import base, driver\\nimport prompt_template_loaders # <-- load this to provide prompt input\\nimport summarization_shortend\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(\\n        prompt_template_loaders,# <-- Hamilton will call above functions\\n        summarization_sortened, \\n    )\\n    .build()\\n)\\n\\n# execute, and pass in the prompt \\nresult = dr.execute(\\n   [\"summarized_text\"],\\n   inputs={\\n      # don\\'t need to pass prompts in this version\\n   }\\n)How would I log prompts used and monitor flows?Here we outline a few ways to monitor what went on.Log results of execution. That is run Hamilton, then emit information to wherever you want it to go.result = dr.execute(\\n   [\"summarized_text\", \\n    \"summarize_chunk_of_text_prompt\",   \\n    ... # and anything else you want to pull out\\n    \"summarize_text_from_summaries_prompt\"],\\n   inputs={\\n      # don\\'t need to pass prompts in this version\\n   }\\n)\\n\\nmy_log_system(result) # send what you want for safe keeping to some\\n                      # system that you own.Note. In the above, Hamilton allows you to requestanyintermediateoutputs simply by requesting “functions” (i.e. nodes in the diagram) by name. 
If we really want to get all the intermediate outputs of the entire dataflow, we can do so and log it wherever we want to!Use loggers inside Hamilton functions (to see the power of this approach,see my old talk on structured logs):import logging\\n\\nlogger = logging.getLogger(__name__)\\n\\ndef summarize_text_from_summaries_prompt(\\n    db_client: Client, another_arg: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query(\\n         \"get latest prompt Y from DB\", another_arg)\\n    logger.info(f\"Prompt used is [{_prompt}]\")\\n    return _promptExtend Hamilton to emit this information. You use Hamilton to capture information from executed functions, i.e. nodes, without needing to insert logging statement inside the function’s body. This promotes reusability since you can toggle logging between development and production settings at the Driver level. SeeGraphAdapters, or write your ownPython decoratorto wrap functions for monitoring.In any of the above code, you could easily pull in a 3rd party tool to help track & monitor the code, as well as the external API call, e.g. data dog. Note, with a one-line code change, you can plug in the DAGWorks’s Driver and get all that monitoring you’d want and more. (Try the free tierhere)!Prompts as codePrompts as static stringsSince prompts are simply strings, they’re also very amenable to being stored along with your source code. The idea is to store as many prompt versions as you like, within your code so that at runtime, the set of prompts available is fixed and deterministic.The MLOps analogy here is, instead of dynamically reloading models, you instead bake the ML model into the container/hard code the reference. Once deployed, your app has everything that it needs. The deployment is immutable; nothing changes once it’s up. 
This makes debugging & determining what’s going on, much simpler.MLOps Analogy: make an immutable deployment by making the model fixed for your app’s deployment.Diagram showing how treating prompts as code enables you to leverage your CI/CD and build an immutable deployment for talking to your LLM API.This approach has many operational benefits:Whenever a new prompt is pushed, it forces a new deployment. Rollback semantics are clear if there’s an issue with a new prompt.You can submit a pull request (PR) for the source code and prompts at the same time. It becomes simpler to review what the change is, and the downstream dependencies of what these prompts will touch/interact with.You can add checks to your CI/CD system to ensure bad prompts don’t make it to production.It’s simpler to debug an issue. You just pull the (Docker) container that was created and you’ll be able to exactly replicate any customer issue quickly and easily.There is no other “prompt system” to maintain or manage. Simplifying operations.It doesn’t preclude adding extra monitoring and visibility.How it would work with HamiltonThe prompts would be encoded into functions into the dataflow/directed acyclic graph (DAG):What summarization.py in the PDF summarizer example looks like. The prompt templates are part of the code. Diagram created via Hamilton.Pairing this code withgit, we have a lightweight versioning system for your entire dataflow (i.e. “chain”), so you can always discern what state the world was in, given a git commit SHA. If you want to manage and have access to multiple prompts at any given point in time, Hamilton has two powerful abstractions to enable you to do so:@config.whenandPython modules. This allows you to store and keep available all older prompt versions together and specify which one to use via code.@config.when (docs)Hamilton has a concept of decorators, which are just annotations on functions. 
The@config.whendecorator allows to specify alternative implementations for a functions, i.e. “node”, in your dataflow. In this case, we specify alternative prompts.from hamilton.function_modifiers import config\\n\\n@config.when(version=\"v1\")\\ndef summarize_chunk_of_text_prompt__v1() -> str:\\n    \"\"\"V1 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text. Extract any key points with reasoning.\\\\n\\\\nContent:\"\\n\\n@config.when(version=\"v2\")\\ndef summarize_chunk_of_text_prompt__v2(content_type: str = \"an academic paper\") -> str:\\n    \"\"\"V2 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text from {content_type}. Extract the key points with reasoning. \\\\n\\\\nContent:\"You can keep adding functions annotated with@config.when, allowing you to swap between them using configuration passed to the HamiltonDriver. When instantiating theDriver,  it will construct the dataflow using the prompt implementation associated with the configuration value.from hamilton import base, driver\\nimport summarization\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(summarization)\\n    .with_config({\"version\": \"v1\"}) # V1 is chosen. Use \"v2\\' for V2.\\n    .build()\\n)Module switchingAlternatively to using@config.when, you can instead place your different prompt implementations into different Python modules. Then, atDriverconstruction time, pass the correct module for the context you want to use.So here we have one module housing V1 of our prompt:# prompts_v1.py\\ndef summarize_chunk_of_text_prompt() -> str:\\n    \"\"\"V1 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text. 
Extract any key points with reasoning.\\\\n\\\\nContent:\"Here we have one module housing V2 (see how they differ slightly):# prompts_v2.py\\ndef summarize_chunk_of_text_prompt(content_type: str = \"an academic paper\") -> str:\\n    \"\"\"V2 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text from {content_type}. Extract the key points with reasoning. \\\\n\\\\nContent:\"In the driver code below, we choose the right module to use based on some context.# run.py\\nfrom hamilton import driver\\nimport summarization\\nimport prompts_v1\\nimport prompts_v2\\n\\n# create driver -- passing in the right module we want\\ndr = (\\n    driver.Builder()\\n    .with_modules(\\n        prompts_v1,  # or prompts_v2\\n        summarization,\\n    )\\n    .build()\\n)Using the module approach allows us to encapsulate and version whole sets of prompts together. If you want to go back in time (via git), or see what a blessed prompt version was, you just need to navigate to the correct commit, and then look in the right module.How would I log prompts used and monitor flows?Assuming you’re using git to track your code, you wouldn’t need to record what prompts were being used. Instead, you’d just need to know what git commit SHA is deployed and you’ll be able to track the version of your code and prompts simultaneously.To monitor flows, just like the above approach, you have the same monitoring hooks available at your disposal, and I wont repeat them here, but they are:Request any intermediate outputs and log them yourself outside of Hamilton.Log them from within the function yourself, or build aPython decorator/GraphAdapterto do it at the framework level.Integrate 3rd party tooling for monitoring your code and LLM API calls, or use the DAGWorks Platform offering to monitor it all. (Try the free tierhere)!or all the above!What about A/B testing my prompts?With any ML initiative, it’s important to measure business impacts of changes. 
Likewise, with LLMs + prompts, it’ll be important to test and measure changes against important business metrics. In the MLOps world, you’d be A/B testing ML models to evaluate their business value by dividing traffic between them. To ensure the randomness necessary to A/B tests, you wouldn’t know at runtime which model to use until a coin is flipped. However, to get those models out, they both would have follow a process to qualify them. So for prompts, we should think similarly.The above two prompt engineering patterns don’t preclude you from being able to A/B test prompts, but it means you need to manage a process to enable however many prompt templates you’re testing in parallel. If you’re also adjusting code paths, having them in code will be simpler to discern and debug what is going on, and you can make use of the `@config.when` decorator / python module swapping for this purpose. Versus, having to critically rely on your logging/monitoring/observability stack to tell you what prompt was used if you’re dynamically loading/passing them in and then having to mentally map which prompts go with which code paths.Note, this all gets harder if you start needing to change multiple prompts for an A/B test because you have several of them in a flow. For example you have two prompts in your workflow and you’re changing LLMs, you’ll want to A/B test the change holistically, rather than individually per prompt. Our advice, by putting the prompts into code your operational life will be simpler, since you’ll know what two prompts belong to what code paths without having to do any mental mapping.Thank you for reading DAGWorks’s Substack. This post is public so feel free to share it.ShareSummaryIn this post, we explained two patterns for managing prompts in a production environment with Hamilton. The first approach treatsprompts asdynamic runtime variables,while the second, treatsprompts as codefor production settings. 
If you value reducing operational burden, then our advice is to encode prompts as code, as it is operationally simpler, unless the speed to change them really matters for you.To recap:Prompts as dynamic runtime variables. Use an external system to pass the prompts to your Hamilton dataflows, or use Hamilton to pull them from a DB. For debugging & monitoring, it’s important to be able to determine what prompt was used for a given invocation. You can integrate open source tools, or use something like the DAGWorks Platform to help ensure you know what was used for any invocation of your code.Prompts as code.Encoding the prompts as code allows easy versioning with git. Change management can be done via pull requests and CI/CD checks. It works well with Hamilton’s features like@config.whenand module switching at the Driver level because it determines clearly what version of the prompt is used. This approach strengthens the use of any tooling you might use to monitor or track, like the DAGWorks Platform, as prompts for a deployment are immutable.We want to hear from you!If you’re excited by any of this, or have strong opinions, leave a comment, or drop by our Slack channel! Some links to do praise/complain/chat:📣join our community on Slack—\\u200awe’re more than happy to help answer questions you might have or get you started.⭐️ us onGitHub.📝 leave us anissueif you find something.📚 read ourdocumentation.⌨️ interactivelylearn about Hamilton in your browser.Other Hamilton posts you might be interested in:We have a growing collection of posts & content. Here are some we think you might be interested in.Containerized PDF Summarizer with FastAPI and HamiltonThierry Jean,DAGWorks Inc., andStefan Krawczyk·Aug 18Skip learning convoluted LLM-specific frameworks and write your first LLM application using regular Python functions and Hamilton! In this post, we’ll present a containerized PDF summarizer powered by the OpenAI API. 
Its flow is encoded in Hamilton, which theRead full storyBuilding a maintainable and modular LLM application stack with HamiltonThierry JeanandDAGWorks Inc.·Jul 11In this post, we’re going to share how Hamilton can help you write modular and maintainable code for your large language model (LLM) application stack. Hamilton is great for describing any type of dataflow, which is exactly what you’re doing when building an LLM powered application. With Hamilton you get strongRead full storytryhamilton.dev– an interactive tutorial in your browser!Hamilton + Airflow(GitHub repo)Hamilton + Feast(GitHub repo)Pandas data transformations in Hamilton in 5 minutesLineage + Hamilton in 10 minutes4Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherShareCommentsTopNewNo postsReady for more?Subscribe Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherDiscover more from DAGWorks’s SubstackThought posts, and updates on Hamilton and the DAGWorks Platform.SubscribeContinue readingSign inLLMOps: Production prompt engineering patterns with HamiltonAn overview of the production grade ways to iterate on prompts with Hamilton.DAGWorks Inc.,Stefan Krawczyk, andThierry JeanSep 6, 20234Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherShareWhat you send to your LLM is quite important. Small variations and changes can have large impacts on outputs, so as your product evolves, the need to evolve your prompts will too. LLMs are also constantly being developed and released, and so as LLMs change, your prompts will also need to change. Therefore it’s important to set up an iteration pattern to operationalize how you “deploy” your prompts so you and your team can move efficiently, but also ensure that production issues are minimized, if not avoided. 
In this post, we’ll guide you through the best practices of managing prompts with Hamilton, making analogies to MLOps patterns, and discussing trade-offs along the way.Notes:(1): if you’re looking for a post that talks about “context management” this isn’t that post. But it is the post that will help you with the nuts and bolts on how to iterate and create that production grade “prompt context management” iteration story.(2): we’ll use prompt & prompt template interchangeably.(3): we’ll assume an “online” web-service setting is where these prompts are being used.(4): we’ll be using ourHamilton’s PDF summarizer exampleto project our patterns onto.(5): not familiar withHamilton? You can either learn about Hamilton viaTry Hamiltonand come back, or get the high level LLMOps approach from this post and then dig into Hamilton via thePDF Summarizer example.(6): what’s our credibility here? We’ve spent our careers building self-service data/MLOps tooling, most famously for Stitch Fix’s 100+ Data Scientists. So we’ve seen our share of outages and approaches play out over time.Thanks for reading DAGWorks’s Substack! Subscribe for free to receive updates and posts like this!SubscribePrompts are to LLMs what hyper-parameters are to ML modelsPoint:Prompts + LLM APIs are analogous to hyper-parameters + machine learning models.In terms of “Ops” practices, LLMOps is still in its infancy. MLOps is a little older, but still neither are widely adopted if you’re comparing it to how widespread knowledge is around DevOps practices.DevOps practices largely concern themselves with how you ship code to production, and MLOps practices how to ship code& data artifacts(e.g., statistical models)to production. So what about LLMOps? 
Personally, I think it’s closer to MLOps since you have:your LLM workflow is simply code.and an LLM API is a data artifact that can be “tweaked” using prompts, similar to a machine learning (ML) model and its hyper-parameters.Therefore, you most likely care about versioning the LLM API + prompts together tightly for good production practices. For instance, in MLOps practice,  you’d want a process in place to validate your ML model still behaves correctly whenever its hyper-parameters are changed.How should you think about operationalizing a prompt?To be clear, the two parts to control for are theLLMand theprompts. Much like  MLOps, when the code or the model artifact changes, you want to be able to determine which did. For LLMOps, we’ll want the same discernment, separating the LLM workflow from the LLM API + prompts. Importantly, we should consider LLMs (self-hosted or APIs) to be mostly static since we less frequently update (or even control) their internals. So, changing thepromptspart of LLM API + prompts is effectively like creating a new model artifact.There are two main ways to treat prompts:Prompts as dynamic runtime variables. The template used isn’t static to a deployment.Prompts as code.The prompt template is static/ predetermined given a deployment.The main difference is the amount of moving parts you need to manage to ensure a great production story. Below, we dig into how to use Hamilton in the context of these two approaches.Prompts as dynamic runtime variablesDynamically Pass/Load PromptsPrompts are just strings. 
Since strings are a primitive type in most languages, this means that they are quite easy to pass around.\\xa0 The idea is to abstract your code so that at runtime you pass in the prompts required.\\xa0 More concretely, you’d “load/reload” prompt templates whenever there’s an “updated” one.The MLOps analogy here, would be to auto-reload the ML model artifact (e.g., a pkl file) whenever a new model is available.MLOps Analogy: diagram showing how ML model auto reloading would look.Diagram showing what dynamically reloading/querying prompts would look like.The benefit here is that you can very quickly roll out new prompts because you do not need to redeploy your application!The downside to this iteration speed is increased operational burden:To someone monitoring your application, it’ll be unclear when the change occurred and whether it’s propagated itself through your systems. For example, you just pushed a new prompt, and the LLM now returns more tokens per request, causing latency to spike; whoever is monitoring will likely be puzzled, unless you have a great change log culture.Rollback semantics involve having to know aboutanothersystem. You can’t just rollback a prior deployment to fix things.You’ll need great monitoring to understand what was run and when; e.g., when customer service gives you a ticket to investigate, how do you know what prompt was in use?You’ll need to manage and monitor whatever system you’re using to manage and store your prompts. This will be an extra system you’ll need to maintain outside of whatever is serving your code.You’ll need to manage two processes, one for updating and pushing the service, and one for updating and pushing prompts. Synchronizing these changes will be on you. For example, you need to make a code change to your service to handle a new prompt. 
You will need to coordinate changing two systems to make it work, which is extra operational overhead to manage.How it would work with HamiltonOur PDF summarizer flow would look something like this if you removesummarize_text_from_summaries_promptandsummarize_chunk_of_text_promptfunction definitions:summarization_shortened.py. Note the two inputs “*_prompt” that denote prompts that are now required as input to the dataflow to function. With Hamilton you’ll be able to determine what inputs should be required for your prompt template by just looking at a diagram like this. Diagram created via Hamilton.To operate things, you’ll want to either inject the prompts at request time:from hamilton import base, driver\\nimport summarization_shortend\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(summarization_sortened)\\n    .build()\\n)\\n\\n# pull prompts from somewhere\\nsummarize_chunk_of_text_prompt = \"\"\"SOME PROMPT FOR {chunked_text}\"\"\"\\nsummarize_text_from_summaries_prompt = \"\"\"SOME PROMPT {summarized_chunks} ... {user_query}\"\"\"\\n\\n# execute, and pass in the prompt \\nresult = dr.execute(\\n   [\"summarized_text\"],\\n   inputs={\\n      \"summarize_chunk_of_text_prompt\": summarize_chunk_of_text_prompt,\\n      ...\\n   }\\n)Or\\xa0you change your code to dynamically load prompts, i.e., add functions to retrieve prompts from an external system as part of the Hamilton dataflow. 
At each invocation, they will query for the prompt to use (you can of course cache this for performance):# prompt_template_loaders.py\\n\\ndef summarize_chunk_of_text_prompt(\\n  db_client: Client, other_args: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query( \\n         \"get latest prompt X from DB\", other_args)\\n    return _prompt\\n\\ndef summarize_text_from_summaries_prompt(\\n   db_client: Client, another_arg: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query(\\n         \"get latest prompt Y from DB\", another_arg)\\n    return _promptDriver code:from hamilton import base, driver\\nimport prompt_template_loaders # <-- load this to provide prompt input\\nimport summarization_shortend\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(\\n        prompt_template_loaders,# <-- Hamilton will call above functions\\n        summarization_sortened, \\n    )\\n    .build()\\n)\\n\\n# execute, and pass in the prompt \\nresult = dr.execute(\\n   [\"summarized_text\"],\\n   inputs={\\n      # don\\'t need to pass prompts in this version\\n   }\\n)How would I log prompts used and monitor flows?Here we outline a few ways to monitor what went on.Log results of execution. That is run Hamilton, then emit information to wherever you want it to go.result = dr.execute(\\n   [\"summarized_text\", \\n    \"summarize_chunk_of_text_prompt\",   \\n    ... # and anything else you want to pull out\\n    \"summarize_text_from_summaries_prompt\"],\\n   inputs={\\n      # don\\'t need to pass prompts in this version\\n   }\\n)\\n\\nmy_log_system(result) # send what you want for safe keeping to some\\n                      # system that you own.Note. In the above, Hamilton allows you to requestanyintermediateoutputs simply by requesting “functions” (i.e. nodes in the diagram) by name. 
If we really want to get all the intermediate outputs of the entire dataflow, we can do so and log it wherever we want to!Use loggers inside Hamilton functions (to see the power of this approach,see my old talk on structured logs):import logging\\n\\nlogger = logging.getLogger(__name__)\\n\\ndef summarize_text_from_summaries_prompt(\\n    db_client: Client, another_arg: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query(\\n         \"get latest prompt Y from DB\", another_arg)\\n    logger.info(f\"Prompt used is [{_prompt}]\")\\n    return _promptExtend Hamilton to emit this information. You use Hamilton to capture information from executed functions, i.e. nodes, without needing to insert logging statement inside the function’s body. This promotes reusability since you can toggle logging between development and production settings at the Driver level. SeeGraphAdapters, or write your ownPython decoratorto wrap functions for monitoring.In any of the above code, you could easily pull in a 3rd party tool to help track & monitor the code, as well as the external API call, e.g. data dog. Note, with a one-line code change, you can plug in the DAGWorks’s Driver and get all that monitoring you’d want and more. (Try the free tierhere)!Prompts as codePrompts as static stringsSince prompts are simply strings, they’re also very amenable to being stored along with your source code. The idea is to store as many prompt versions as you like, within your code so that at runtime, the set of prompts available is fixed and deterministic.The MLOps analogy here is, instead of dynamically reloading models, you instead bake the ML model into the container/hard code the reference. Once deployed, your app has everything that it needs. The deployment is immutable; nothing changes once it’s up. 
This makes debugging & determining what’s going on, much simpler.MLOps Analogy: make an immutable deployment by making the model fixed for your app’s deployment.Diagram showing how treating prompts as code enables you to leverage your CI/CD and build an immutable deployment for talking to your LLM API.This approach has many operational benefits:Whenever a new prompt is pushed, it forces a new deployment. Rollback semantics are clear if there’s an issue with a new prompt.You can submit a pull request (PR) for the source code and prompts at the same time. It becomes simpler to review what the change is, and the downstream dependencies of what these prompts will touch/interact with.You can add checks to your CI/CD system to ensure bad prompts don’t make it to production.It’s simpler to debug an issue. You just pull the (Docker) container that was created and you’ll be able to exactly replicate any customer issue quickly and easily.There is no other “prompt system” to maintain or manage. Simplifying operations.It doesn’t preclude adding extra monitoring and visibility.How it would work with HamiltonThe prompts would be encoded into functions into the dataflow/directed acyclic graph (DAG):What summarization.py in the PDF summarizer example looks like. The prompt templates are part of the code. Diagram created via Hamilton.Pairing this code withgit, we have a lightweight versioning system for your entire dataflow (i.e. “chain”), so you can always discern what state the world was in, given a git commit SHA. If you want to manage and have access to multiple prompts at any given point in time, Hamilton has two powerful abstractions to enable you to do so:@config.whenandPython modules. This allows you to store and keep available all older prompt versions together and specify which one to use via code.@config.when (docs)Hamilton has a concept of decorators, which are just annotations on functions. 
The@config.whendecorator allows to specify alternative implementations for a functions, i.e. “node”, in your dataflow. In this case, we specify alternative prompts.from hamilton.function_modifiers import config\\n\\n@config.when(version=\"v1\")\\ndef summarize_chunk_of_text_prompt__v1() -> str:\\n    \"\"\"V1 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text. Extract any key points with reasoning.\\\\n\\\\nContent:\"\\n\\n@config.when(version=\"v2\")\\ndef summarize_chunk_of_text_prompt__v2(content_type: str = \"an academic paper\") -> str:\\n    \"\"\"V2 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text from {content_type}. Extract the key points with reasoning. \\\\n\\\\nContent:\"You can keep adding functions annotated with@config.when, allowing you to swap between them using configuration passed to the HamiltonDriver. When instantiating theDriver,  it will construct the dataflow using the prompt implementation associated with the configuration value.from hamilton import base, driver\\nimport summarization\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(summarization)\\n    .with_config({\"version\": \"v1\"}) # V1 is chosen. Use \"v2\\' for V2.\\n    .build()\\n)Module switchingAlternatively to using@config.when, you can instead place your different prompt implementations into different Python modules. Then, atDriverconstruction time, pass the correct module for the context you want to use.So here we have one module housing V1 of our prompt:# prompts_v1.py\\ndef summarize_chunk_of_text_prompt() -> str:\\n    \"\"\"V1 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text. 
Extract any key points with reasoning.\\\\n\\\\nContent:\"Here we have one module housing V2 (see how they differ slightly):# prompts_v2.py\\ndef summarize_chunk_of_text_prompt(content_type: str = \"an academic paper\") -> str:\\n    \"\"\"V2 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text from {content_type}. Extract the key points with reasoning. \\\\n\\\\nContent:\"In the driver code below, we choose the right module to use based on some context.# run.py\\nfrom hamilton import driver\\nimport summarization\\nimport prompts_v1\\nimport prompts_v2\\n\\n# create driver -- passing in the right module we want\\ndr = (\\n    driver.Builder()\\n    .with_modules(\\n        prompts_v1,  # or prompts_v2\\n        summarization,\\n    )\\n    .build()\\n)Using the module approach allows us to encapsulate and version whole sets of prompts together. If you want to go back in time (via git), or see what a blessed prompt version was, you just need to navigate to the correct commit, and then look in the right module.How would I log prompts used and monitor flows?Assuming you’re using git to track your code, you wouldn’t need to record what prompts were being used. Instead, you’d just need to know what git commit SHA is deployed and you’ll be able to track the version of your code and prompts simultaneously.To monitor flows, just like the above approach, you have the same monitoring hooks available at your disposal, and I wont repeat them here, but they are:Request any intermediate outputs and log them yourself outside of Hamilton.Log them from within the function yourself, or build aPython decorator/GraphAdapterto do it at the framework level.Integrate 3rd party tooling for monitoring your code and LLM API calls, or use the DAGWorks Platform offering to monitor it all. (Try the free tierhere)!or all the above!What about A/B testing my prompts?With any ML initiative, it’s important to measure business impacts of changes. 
Likewise, with LLMs + prompts, it’ll be important to test and measure changes against important business metrics. In the MLOps world, you’d be A/B testing ML models to evaluate their business value by dividing traffic between them. To ensure the randomness necessary for A/B tests, you wouldn’t know at runtime which model to use until a coin is flipped. However, to get those models out, they both would have to follow a process to qualify them. So for prompts, we should think similarly. The above two prompt engineering patterns don’t preclude you from being able to A/B test prompts, but it means you need to manage a process to enable however many prompt templates you’re testing in parallel. If you’re also adjusting code paths, having them in code will be simpler to discern and debug what is going on, and you can make use of the `@config.when` decorator / python module swapping for this purpose. Versus, having to critically rely on your logging/monitoring/observability stack to tell you what prompt was used if you’re dynamically loading/passing them in and then having to mentally map which prompts go with which code paths. Note, this all gets harder if you start needing to change multiple prompts for an A/B test because you have several of them in a flow. For example, if you have two prompts in your workflow and you’re changing LLMs, you’ll want to A/B test the change holistically, rather than individually per prompt. Our advice: by putting the prompts into code your operational life will be simpler, since you’ll know which two prompts belong to which code paths without having to do any mental mapping. Thank you for reading DAGWorks’s Substack. This post is public so feel free to share it. Share Summary In this post, we explained two patterns for managing prompts in a production environment with Hamilton. The first approach treats prompts as dynamic runtime variables, while the second treats prompts as code for production settings. 
If you value reducing operational burden, then our advice is to encode prompts as code, as it is operationally simpler, unless the speed to change them really matters for you.To recap:Prompts as dynamic runtime variables. Use an external system to pass the prompts to your Hamilton dataflows, or use Hamilton to pull them from a DB. For debugging & monitoring, it’s important to be able to determine what prompt was used for a given invocation. You can integrate open source tools, or use something like the DAGWorks Platform to help ensure you know what was used for any invocation of your code.Prompts as code.Encoding the prompts as code allows easy versioning with git. Change management can be done via pull requests and CI/CD checks. It works well with Hamilton’s features like@config.whenand module switching at the Driver level because it determines clearly what version of the prompt is used. This approach strengthens the use of any tooling you might use to monitor or track, like the DAGWorks Platform, as prompts for a deployment are immutable.We want to hear from you!If you’re excited by any of this, or have strong opinions, leave a comment, or drop by our Slack channel! Some links to do praise/complain/chat:📣join our community on Slack—\\u200awe’re more than happy to help answer questions you might have or get you started.⭐️ us onGitHub.📝 leave us anissueif you find something.📚 read ourdocumentation.⌨️ interactivelylearn about Hamilton in your browser.Other Hamilton posts you might be interested in:We have a growing collection of posts & content. Here are some we think you might be interested in.Containerized PDF Summarizer with FastAPI and HamiltonThierry Jean,DAGWorks Inc., andStefan Krawczyk·Aug 18Skip learning convoluted LLM-specific frameworks and write your first LLM application using regular Python functions and Hamilton! In this post, we’ll present a containerized PDF summarizer powered by the OpenAI API. 
Its flow is encoded in Hamilton, which theRead full storyBuilding a maintainable and modular LLM application stack with HamiltonThierry JeanandDAGWorks Inc.·Jul 11In this post, we’re going to share how Hamilton can help you write modular and maintainable code for your large language model (LLM) application stack. Hamilton is great for describing any type of dataflow, which is exactly what you’re doing when building an LLM powered application. With Hamilton you get strongRead full storytryhamilton.dev– an interactive tutorial in your browser!Hamilton + Airflow(GitHub repo)Hamilton + Feast(GitHub repo)Pandas data transformations in Hamilton in 5 minutesLineage + Hamilton in 10 minutes4Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherShare Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherDiscover more from DAGWorks’s SubstackThought posts, and updates on Hamilton and the DAGWorks Platform.SubscribeContinue readingSign inLLMOps: Production prompt engineering patterns with HamiltonAn overview of the production grade ways to iterate on prompts with Hamilton.DAGWorks Inc.,Stefan Krawczyk, andThierry JeanSep 6, 20234Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherShareWhat you send to your LLM is quite important. Small variations and changes can have large impacts on outputs, so as your product evolves, the need to evolve your prompts will too. LLMs are also constantly being developed and released, and so as LLMs change, your prompts will also need to change. Therefore it’s important to set up an iteration pattern to operationalize how you “deploy” your prompts so you and your team can move efficiently, but also ensure that production issues are minimized, if not avoided. 
In this post, we’ll guide you through the best practices of managing prompts with Hamilton, making analogies to MLOps patterns, and discussing trade-offs along the way.Notes:(1): if you’re looking for a post that talks about “context management” this isn’t that post. But it is the post that will help you with the nuts and bolts on how to iterate and create that production grade “prompt context management” iteration story.(2): we’ll use prompt & prompt template interchangeably.(3): we’ll assume an “online” web-service setting is where these prompts are being used.(4): we’ll be using ourHamilton’s PDF summarizer exampleto project our patterns onto.(5): not familiar withHamilton? You can either learn about Hamilton viaTry Hamiltonand come back, or get the high level LLMOps approach from this post and then dig into Hamilton via thePDF Summarizer example.(6): what’s our credibility here? We’ve spent our careers building self-service data/MLOps tooling, most famously for Stitch Fix’s 100+ Data Scientists. So we’ve seen our share of outages and approaches play out over time.Thanks for reading DAGWorks’s Substack! Subscribe for free to receive updates and posts like this!SubscribePrompts are to LLMs what hyper-parameters are to ML modelsPoint:Prompts + LLM APIs are analogous to hyper-parameters + machine learning models.In terms of “Ops” practices, LLMOps is still in its infancy. MLOps is a little older, but still neither are widely adopted if you’re comparing it to how widespread knowledge is around DevOps practices.DevOps practices largely concern themselves with how you ship code to production, and MLOps practices how to ship code& data artifacts(e.g., statistical models)to production. So what about LLMOps? 
Personally, I think it’s closer to MLOps since you have:your LLM workflow is simply code.and an LLM API is a data artifact that can be “tweaked” using prompts, similar to a machine learning (ML) model and its hyper-parameters.Therefore, you most likely care about versioning the LLM API + prompts together tightly for good production practices. For instance, in MLOps practice,  you’d want a process in place to validate your ML model still behaves correctly whenever its hyper-parameters are changed.How should you think about operationalizing a prompt?To be clear, the two parts to control for are theLLMand theprompts. Much like  MLOps, when the code or the model artifact changes, you want to be able to determine which did. For LLMOps, we’ll want the same discernment, separating the LLM workflow from the LLM API + prompts. Importantly, we should consider LLMs (self-hosted or APIs) to be mostly static since we less frequently update (or even control) their internals. So, changing thepromptspart of LLM API + prompts is effectively like creating a new model artifact.There are two main ways to treat prompts:Prompts as dynamic runtime variables. The template used isn’t static to a deployment.Prompts as code.The prompt template is static/ predetermined given a deployment.The main difference is the amount of moving parts you need to manage to ensure a great production story. Below, we dig into how to use Hamilton in the context of these two approaches.Prompts as dynamic runtime variablesDynamically Pass/Load PromptsPrompts are just strings. 
Since strings are a primitive type in most languages, this means that they are quite easy to pass around. The idea is to abstract your code so that at runtime you pass in the prompts required. More concretely, you’d “load/reload” prompt templates whenever there’s an “updated” one. The MLOps analogy here would be to auto-reload the ML model artifact (e.g., a pkl file) whenever a new model is available. MLOps Analogy: diagram showing how ML model auto reloading would look. Diagram showing what dynamically reloading/querying prompts would look like. The benefit here is that you can very quickly roll out new prompts because you do not need to redeploy your application! The downside to this iteration speed is increased operational burden: To someone monitoring your application, it’ll be unclear when the change occurred and whether it’s propagated itself through your systems. For example, you just pushed a new prompt, and the LLM now returns more tokens per request, causing latency to spike; whoever is monitoring will likely be puzzled, unless you have a great change log culture. Rollback semantics involve having to know about another system. You can’t just rollback a prior deployment to fix things. You’ll need great monitoring to understand what was run and when; e.g., when customer service gives you a ticket to investigate, how do you know what prompt was in use? You’ll need to manage and monitor whatever system you’re using to manage and store your prompts. This will be an extra system you’ll need to maintain outside of whatever is serving your code. You’ll need to manage two processes, one for updating and pushing the service, and one for updating and pushing prompts. Synchronizing these changes will be on you. For example, you need to make a code change to your service to handle a new prompt. 
You will need to coordinate changing two systems to make it work, which is extra operational overhead to manage.How it would work with HamiltonOur PDF summarizer flow would look something like this if you removesummarize_text_from_summaries_promptandsummarize_chunk_of_text_promptfunction definitions:summarization_shortened.py. Note the two inputs “*_prompt” that denote prompts that are now required as input to the dataflow to function. With Hamilton you’ll be able to determine what inputs should be required for your prompt template by just looking at a diagram like this. Diagram created via Hamilton.To operate things, you’ll want to either inject the prompts at request time:from hamilton import base, driver\\nimport summarization_shortend\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(summarization_sortened)\\n    .build()\\n)\\n\\n# pull prompts from somewhere\\nsummarize_chunk_of_text_prompt = \"\"\"SOME PROMPT FOR {chunked_text}\"\"\"\\nsummarize_text_from_summaries_prompt = \"\"\"SOME PROMPT {summarized_chunks} ... {user_query}\"\"\"\\n\\n# execute, and pass in the prompt \\nresult = dr.execute(\\n   [\"summarized_text\"],\\n   inputs={\\n      \"summarize_chunk_of_text_prompt\": summarize_chunk_of_text_prompt,\\n      ...\\n   }\\n)Or\\xa0you change your code to dynamically load prompts, i.e., add functions to retrieve prompts from an external system as part of the Hamilton dataflow. 
At each invocation, they will query for the prompt to use (you can of course cache this for performance):# prompt_template_loaders.py\\n\\ndef summarize_chunk_of_text_prompt(\\n  db_client: Client, other_args: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query( \\n         \"get latest prompt X from DB\", other_args)\\n    return _prompt\\n\\ndef summarize_text_from_summaries_prompt(\\n   db_client: Client, another_arg: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query(\\n         \"get latest prompt Y from DB\", another_arg)\\n    return _promptDriver code:from hamilton import base, driver\\nimport prompt_template_loaders # <-- load this to provide prompt input\\nimport summarization_shortend\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(\\n        prompt_template_loaders,# <-- Hamilton will call above functions\\n        summarization_sortened, \\n    )\\n    .build()\\n)\\n\\n# execute, and pass in the prompt \\nresult = dr.execute(\\n   [\"summarized_text\"],\\n   inputs={\\n      # don\\'t need to pass prompts in this version\\n   }\\n)How would I log prompts used and monitor flows?Here we outline a few ways to monitor what went on.Log results of execution. That is run Hamilton, then emit information to wherever you want it to go.result = dr.execute(\\n   [\"summarized_text\", \\n    \"summarize_chunk_of_text_prompt\",   \\n    ... # and anything else you want to pull out\\n    \"summarize_text_from_summaries_prompt\"],\\n   inputs={\\n      # don\\'t need to pass prompts in this version\\n   }\\n)\\n\\nmy_log_system(result) # send what you want for safe keeping to some\\n                      # system that you own.Note. In the above, Hamilton allows you to requestanyintermediateoutputs simply by requesting “functions” (i.e. nodes in the diagram) by name. 
If we really want to get all the intermediate outputs of the entire dataflow, we can do so and log it wherever we want to!Use loggers inside Hamilton functions (to see the power of this approach,see my old talk on structured logs):import logging\\n\\nlogger = logging.getLogger(__name__)\\n\\ndef summarize_text_from_summaries_prompt(\\n    db_client: Client, another_arg: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query(\\n         \"get latest prompt Y from DB\", another_arg)\\n    logger.info(f\"Prompt used is [{_prompt}]\")\\n    return _promptExtend Hamilton to emit this information. You use Hamilton to capture information from executed functions, i.e. nodes, without needing to insert logging statement inside the function’s body. This promotes reusability since you can toggle logging between development and production settings at the Driver level. SeeGraphAdapters, or write your ownPython decoratorto wrap functions for monitoring.In any of the above code, you could easily pull in a 3rd party tool to help track & monitor the code, as well as the external API call, e.g. data dog. Note, with a one-line code change, you can plug in the DAGWorks’s Driver and get all that monitoring you’d want and more. (Try the free tierhere)!Prompts as codePrompts as static stringsSince prompts are simply strings, they’re also very amenable to being stored along with your source code. The idea is to store as many prompt versions as you like, within your code so that at runtime, the set of prompts available is fixed and deterministic.The MLOps analogy here is, instead of dynamically reloading models, you instead bake the ML model into the container/hard code the reference. Once deployed, your app has everything that it needs. The deployment is immutable; nothing changes once it’s up. 
This makes debugging & determining what’s going on, much simpler.MLOps Analogy: make an immutable deployment by making the model fixed for your app’s deployment.Diagram showing how treating prompts as code enables you to leverage your CI/CD and build an immutable deployment for talking to your LLM API.This approach has many operational benefits:Whenever a new prompt is pushed, it forces a new deployment. Rollback semantics are clear if there’s an issue with a new prompt.You can submit a pull request (PR) for the source code and prompts at the same time. It becomes simpler to review what the change is, and the downstream dependencies of what these prompts will touch/interact with.You can add checks to your CI/CD system to ensure bad prompts don’t make it to production.It’s simpler to debug an issue. You just pull the (Docker) container that was created and you’ll be able to exactly replicate any customer issue quickly and easily.There is no other “prompt system” to maintain or manage. Simplifying operations.It doesn’t preclude adding extra monitoring and visibility.How it would work with HamiltonThe prompts would be encoded into functions into the dataflow/directed acyclic graph (DAG):What summarization.py in the PDF summarizer example looks like. The prompt templates are part of the code. Diagram created via Hamilton.Pairing this code withgit, we have a lightweight versioning system for your entire dataflow (i.e. “chain”), so you can always discern what state the world was in, given a git commit SHA. If you want to manage and have access to multiple prompts at any given point in time, Hamilton has two powerful abstractions to enable you to do so:@config.whenandPython modules. This allows you to store and keep available all older prompt versions together and specify which one to use via code.@config.when (docs)Hamilton has a concept of decorators, which are just annotations on functions. 
The@config.whendecorator allows to specify alternative implementations for a functions, i.e. “node”, in your dataflow. In this case, we specify alternative prompts.from hamilton.function_modifiers import config\\n\\n@config.when(version=\"v1\")\\ndef summarize_chunk_of_text_prompt__v1() -> str:\\n    \"\"\"V1 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text. Extract any key points with reasoning.\\\\n\\\\nContent:\"\\n\\n@config.when(version=\"v2\")\\ndef summarize_chunk_of_text_prompt__v2(content_type: str = \"an academic paper\") -> str:\\n    \"\"\"V2 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text from {content_type}. Extract the key points with reasoning. \\\\n\\\\nContent:\"You can keep adding functions annotated with@config.when, allowing you to swap between them using configuration passed to the HamiltonDriver. When instantiating theDriver,  it will construct the dataflow using the prompt implementation associated with the configuration value.from hamilton import base, driver\\nimport summarization\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(summarization)\\n    .with_config({\"version\": \"v1\"}) # V1 is chosen. Use \"v2\\' for V2.\\n    .build()\\n)Module switchingAlternatively to using@config.when, you can instead place your different prompt implementations into different Python modules. Then, atDriverconstruction time, pass the correct module for the context you want to use.So here we have one module housing V1 of our prompt:# prompts_v1.py\\ndef summarize_chunk_of_text_prompt() -> str:\\n    \"\"\"V1 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text. 
Extract any key points with reasoning.\\\\n\\\\nContent:\"Here we have one module housing V2 (see how they differ slightly):# prompts_v2.py\\ndef summarize_chunk_of_text_prompt(content_type: str = \"an academic paper\") -> str:\\n    \"\"\"V2 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text from {content_type}. Extract the key points with reasoning. \\\\n\\\\nContent:\"In the driver code below, we choose the right module to use based on some context.# run.py\\nfrom hamilton import driver\\nimport summarization\\nimport prompts_v1\\nimport prompts_v2\\n\\n# create driver -- passing in the right module we want\\ndr = (\\n    driver.Builder()\\n    .with_modules(\\n        prompts_v1,  # or prompts_v2\\n        summarization,\\n    )\\n    .build()\\n)Using the module approach allows us to encapsulate and version whole sets of prompts together. If you want to go back in time (via git), or see what a blessed prompt version was, you just need to navigate to the correct commit, and then look in the right module.How would I log prompts used and monitor flows?Assuming you’re using git to track your code, you wouldn’t need to record what prompts were being used. Instead, you’d just need to know what git commit SHA is deployed and you’ll be able to track the version of your code and prompts simultaneously.To monitor flows, just like the above approach, you have the same monitoring hooks available at your disposal, and I wont repeat them here, but they are:Request any intermediate outputs and log them yourself outside of Hamilton.Log them from within the function yourself, or build aPython decorator/GraphAdapterto do it at the framework level.Integrate 3rd party tooling for monitoring your code and LLM API calls, or use the DAGWorks Platform offering to monitor it all. (Try the free tierhere)!or all the above!What about A/B testing my prompts?With any ML initiative, it’s important to measure business impacts of changes. 
Likewise, with LLMs + prompts, it’ll be important to test and measure changes against important business metrics. In the MLOps world, you’d be A/B testing ML models to evaluate their business value by dividing traffic between them. To ensure the randomness necessary for A/B tests, you wouldn’t know at runtime which model to use until a coin is flipped. However, to get those models out, they both would have to follow a process to qualify them. So for prompts, we should think similarly. The above two prompt engineering patterns don’t preclude you from being able to A/B test prompts, but it means you need to manage a process to enable however many prompt templates you’re testing in parallel. If you’re also adjusting code paths, having them in code will be simpler to discern and debug what is going on, and you can make use of the `@config.when` decorator / python module swapping for this purpose. Versus, having to critically rely on your logging/monitoring/observability stack to tell you what prompt was used if you’re dynamically loading/passing them in and then having to mentally map which prompts go with which code paths. Note, this all gets harder if you start needing to change multiple prompts for an A/B test because you have several of them in a flow. For example, if you have two prompts in your workflow and you’re changing LLMs, you’ll want to A/B test the change holistically, rather than individually per prompt. Our advice: by putting the prompts into code your operational life will be simpler, since you’ll know which two prompts belong to which code paths without having to do any mental mapping. Thank you for reading DAGWorks’s Substack. This post is public so feel free to share it. Share Summary In this post, we explained two patterns for managing prompts in a production environment with Hamilton. The first approach treats prompts as dynamic runtime variables, while the second treats prompts as code for production settings. 
If you value reducing operational burden, then our advice is to encode prompts as code, as it is operationally simpler, unless the speed to change them really matters for you.To recap:Prompts as dynamic runtime variables. Use an external system to pass the prompts to your Hamilton dataflows, or use Hamilton to pull them from a DB. For debugging & monitoring, it’s important to be able to determine what prompt was used for a given invocation. You can integrate open source tools, or use something like the DAGWorks Platform to help ensure you know what was used for any invocation of your code.Prompts as code.Encoding the prompts as code allows easy versioning with git. Change management can be done via pull requests and CI/CD checks. It works well with Hamilton’s features like@config.whenand module switching at the Driver level because it determines clearly what version of the prompt is used. This approach strengthens the use of any tooling you might use to monitor or track, like the DAGWorks Platform, as prompts for a deployment are immutable.We want to hear from you!If you’re excited by any of this, or have strong opinions, leave a comment, or drop by our Slack channel! Some links to do praise/complain/chat:📣join our community on Slack—\\u200awe’re more than happy to help answer questions you might have or get you started.⭐️ us onGitHub.📝 leave us anissueif you find something.📚 read ourdocumentation.⌨️ interactivelylearn about Hamilton in your browser.Other Hamilton posts you might be interested in:We have a growing collection of posts & content. Here are some we think you might be interested in.Containerized PDF Summarizer with FastAPI and HamiltonThierry Jean,DAGWorks Inc., andStefan Krawczyk·Aug 18Skip learning convoluted LLM-specific frameworks and write your first LLM application using regular Python functions and Hamilton! In this post, we’ll present a containerized PDF summarizer powered by the OpenAI API. 
Its flow is encoded in Hamilton, which theRead full storyBuilding a maintainable and modular LLM application stack with HamiltonThierry JeanandDAGWorks Inc.·Jul 11In this post, we’re going to share how Hamilton can help you write modular and maintainable code for your large language model (LLM) application stack. Hamilton is great for describing any type of dataflow, which is exactly what you’re doing when building an LLM powered application. With Hamilton you get strongRead full storytryhamilton.dev– an interactive tutorial in your browser!Hamilton + Airflow(GitHub repo)Hamilton + Feast(GitHub repo)Pandas data transformations in Hamilton in 5 minutesLineage + Hamilton in 10 minutes4Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherShare Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther Share this post LLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther LLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.io  LLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.io blog.dagworks.io Copy linkFacebookEmailNoteOther  Copy link  Facebook  Email  Note  Other Discover more from DAGWorks’s SubstackThought posts, and updates on Hamilton and the DAGWorks Platform.SubscribeContinue readingSign in   Thought posts, and updates on Hamilton and the DAGWorks 
Platform.   Continue reading Continue reading Sign in LLMOps: Production prompt engineering patterns with HamiltonAn overview of the production grade ways to iterate on prompts with Hamilton.DAGWorks Inc.,Stefan Krawczyk, andThierry JeanSep 6, 20234Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherShare DAGWorks Inc.,Stefan Krawczyk, andThierry JeanSep 6, 20234Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherShare DAGWorks Inc.,Stefan Krawczyk, andThierry JeanSep 6, 2023 DAGWorks Inc.,Stefan Krawczyk, andThierry JeanSep 6, 2023        DAGWorks Inc.,Stefan Krawczyk, andThierry JeanSep 6, 2023 DAGWorks Inc.,Stefan Krawczyk, andThierry Jean DAGWorks Inc. Stefan Krawczyk Thierry Jean Sep 6, 2023 Sep 6, 2023 4Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherShare 4Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther 4Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther 4 Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther Share this post LLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther LLMOps: Production 
prompt engineering patterns with Hamiltonblog.dagworks.io  LLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.io blog.dagworks.io Copy linkFacebookEmailNoteOther  Copy link  Facebook  Email  Note  Other Share Share  What you send to your LLM is quite important. Small variations and changes can have large impacts on outputs, so as your product evolves, the need to evolve your prompts will too. LLMs are also constantly being developed and released, and so as LLMs change, your prompts will also need to change. Therefore it’s important to set up an iteration pattern to operationalize how you “deploy” your prompts so you and your team can move efficiently, but also ensure that production issues are minimized, if not avoided. In this post, we’ll guide you through the best practices of managing prompts with Hamilton, making analogies to MLOps patterns, and discussing trade-offs along the way.Notes:(1): if you’re looking for a post that talks about “context management” this isn’t that post. But it is the post that will help you with the nuts and bolts on how to iterate and create that production grade “prompt context management” iteration story.(2): we’ll use prompt & prompt template interchangeably.(3): we’ll assume an “online” web-service setting is where these prompts are being used.(4): we’ll be using ourHamilton’s PDF summarizer exampleto project our patterns onto.(5): not familiar withHamilton? You can either learn about Hamilton viaTry Hamiltonand come back, or get the high level LLMOps approach from this post and then dig into Hamilton via thePDF Summarizer example.(6): what’s our credibility here? We’ve spent our careers building self-service data/MLOps tooling, most famously for Stitch Fix’s 100+ Data Scientists. So we’ve seen our share of outages and approaches play out over time.Thanks for reading DAGWorks’s Substack! 
Subscribe for free to receive updates and posts like this!SubscribePrompts are to LLMs what hyper-parameters are to ML modelsPoint:Prompts + LLM APIs are analogous to hyper-parameters + machine learning models.In terms of “Ops” practices, LLMOps is still in its infancy. MLOps is a little older, but still neither are widely adopted if you’re comparing it to how widespread knowledge is around DevOps practices.DevOps practices largely concern themselves with how you ship code to production, and MLOps practices how to ship code& data artifacts(e.g., statistical models)to production. So what about LLMOps? Personally, I think it’s closer to MLOps since you have:your LLM workflow is simply code.and an LLM API is a data artifact that can be “tweaked” using prompts, similar to a machine learning (ML) model and its hyper-parameters.Therefore, you most likely care about versioning the LLM API + prompts together tightly for good production practices. For instance, in MLOps practice,  you’d want a process in place to validate your ML model still behaves correctly whenever its hyper-parameters are changed.How should you think about operationalizing a prompt?To be clear, the two parts to control for are theLLMand theprompts. Much like  MLOps, when the code or the model artifact changes, you want to be able to determine which did. For LLMOps, we’ll want the same discernment, separating the LLM workflow from the LLM API + prompts. Importantly, we should consider LLMs (self-hosted or APIs) to be mostly static since we less frequently update (or even control) their internals. So, changing thepromptspart of LLM API + prompts is effectively like creating a new model artifact.There are two main ways to treat prompts:Prompts as dynamic runtime variables. The template used isn’t static to a deployment.Prompts as code.The prompt template is static/ predetermined given a deployment.The main difference is the amount of moving parts you need to manage to ensure a great production story. 
Below, we dig into how to use Hamilton in the context of these two approaches.Prompts as dynamic runtime variablesDynamically Pass/Load PromptsPrompts are just strings. Since strings are a primitive type in most languages, this means that they are quite easy to pass around.\\xa0 The idea is to abstract your code so that at runtime you pass in the prompts required.\\xa0 More concretely, you’d “load/reload” prompt templates whenever there’s an “updated” one.The MLOps analogy here, would be to auto-reload the ML model artifact (e.g., a pkl file) whenever a new model is available.MLOps Analogy: diagram showing how ML model auto reloading would look.Diagram showing what dynamically reloading/querying prompts would look like.The benefit here is that you can very quickly roll out new prompts because you do not need to redeploy your application!The downside to this iteration speed is increased operational burden:To someone monitoring your application, it’ll be unclear when the change occurred and whether it’s propagated itself through your systems. For example, you just pushed a new prompt, and the LLM now returns more tokens per request, causing latency to spike; whoever is monitoring will likely be puzzled, unless you have a great change log culture.Rollback semantics involve having to know aboutanothersystem. You can’t just rollback a prior deployment to fix things.You’ll need great monitoring to understand what was run and when; e.g., when customer service gives you a ticket to investigate, how do you know what prompt was in use?You’ll need to manage and monitor whatever system you’re using to manage and store your prompts. This will be an extra system you’ll need to maintain outside of whatever is serving your code.You’ll need to manage two processes, one for updating and pushing the service, and one for updating and pushing prompts. Synchronizing these changes will be on you. For example, you need to make a code change to your service to handle a new prompt. 
You will need to coordinate changing two systems to make it work, which is extra operational overhead to manage.How it would work with HamiltonOur PDF summarizer flow would look something like this if you removesummarize_text_from_summaries_promptandsummarize_chunk_of_text_promptfunction definitions:summarization_shortened.py. Note the two inputs “*_prompt” that denote prompts that are now required as input to the dataflow to function. With Hamilton you’ll be able to determine what inputs should be required for your prompt template by just looking at a diagram like this. Diagram created via Hamilton.To operate things, you’ll want to either inject the prompts at request time:from hamilton import base, driver\\nimport summarization_shortend\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(summarization_sortened)\\n    .build()\\n)\\n\\n# pull prompts from somewhere\\nsummarize_chunk_of_text_prompt = \"\"\"SOME PROMPT FOR {chunked_text}\"\"\"\\nsummarize_text_from_summaries_prompt = \"\"\"SOME PROMPT {summarized_chunks} ... {user_query}\"\"\"\\n\\n# execute, and pass in the prompt \\nresult = dr.execute(\\n   [\"summarized_text\"],\\n   inputs={\\n      \"summarize_chunk_of_text_prompt\": summarize_chunk_of_text_prompt,\\n      ...\\n   }\\n)Or\\xa0you change your code to dynamically load prompts, i.e., add functions to retrieve prompts from an external system as part of the Hamilton dataflow. 
At each invocation, they will query for the prompt to use (you can of course cache this for performance):# prompt_template_loaders.py\\n\\ndef summarize_chunk_of_text_prompt(\\n  db_client: Client, other_args: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query( \\n         \"get latest prompt X from DB\", other_args)\\n    return _prompt\\n\\ndef summarize_text_from_summaries_prompt(\\n   db_client: Client, another_arg: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query(\\n         \"get latest prompt Y from DB\", another_arg)\\n    return _promptDriver code:from hamilton import base, driver\\nimport prompt_template_loaders # <-- load this to provide prompt input\\nimport summarization_shortend\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(\\n        prompt_template_loaders,# <-- Hamilton will call above functions\\n        summarization_sortened, \\n    )\\n    .build()\\n)\\n\\n# execute, and pass in the prompt \\nresult = dr.execute(\\n   [\"summarized_text\"],\\n   inputs={\\n      # don\\'t need to pass prompts in this version\\n   }\\n)How would I log prompts used and monitor flows?Here we outline a few ways to monitor what went on.Log results of execution. That is run Hamilton, then emit information to wherever you want it to go.result = dr.execute(\\n   [\"summarized_text\", \\n    \"summarize_chunk_of_text_prompt\",   \\n    ... # and anything else you want to pull out\\n    \"summarize_text_from_summaries_prompt\"],\\n   inputs={\\n      # don\\'t need to pass prompts in this version\\n   }\\n)\\n\\nmy_log_system(result) # send what you want for safe keeping to some\\n                      # system that you own.Note. In the above, Hamilton allows you to requestanyintermediateoutputs simply by requesting “functions” (i.e. nodes in the diagram) by name. 
If we really want to get all the intermediate outputs of the entire dataflow, we can do so and log it wherever we want to!Use loggers inside Hamilton functions (to see the power of this approach,see my old talk on structured logs):import logging\\n\\nlogger = logging.getLogger(__name__)\\n\\ndef summarize_text_from_summaries_prompt(\\n    db_client: Client, another_arg: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query(\\n         \"get latest prompt Y from DB\", another_arg)\\n    logger.info(f\"Prompt used is [{_prompt}]\")\\n    return _promptExtend Hamilton to emit this information. You use Hamilton to capture information from executed functions, i.e. nodes, without needing to insert logging statement inside the function’s body. This promotes reusability since you can toggle logging between development and production settings at the Driver level. SeeGraphAdapters, or write your ownPython decoratorto wrap functions for monitoring.In any of the above code, you could easily pull in a 3rd party tool to help track & monitor the code, as well as the external API call, e.g. data dog. Note, with a one-line code change, you can plug in the DAGWorks’s Driver and get all that monitoring you’d want and more. (Try the free tierhere)!Prompts as codePrompts as static stringsSince prompts are simply strings, they’re also very amenable to being stored along with your source code. The idea is to store as many prompt versions as you like, within your code so that at runtime, the set of prompts available is fixed and deterministic.The MLOps analogy here is, instead of dynamically reloading models, you instead bake the ML model into the container/hard code the reference. Once deployed, your app has everything that it needs. The deployment is immutable; nothing changes once it’s up. 
This makes debugging & determining what’s going on, much simpler.MLOps Analogy: make an immutable deployment by making the model fixed for your app’s deployment.Diagram showing how treating prompts as code enables you to leverage your CI/CD and build an immutable deployment for talking to your LLM API.This approach has many operational benefits:Whenever a new prompt is pushed, it forces a new deployment. Rollback semantics are clear if there’s an issue with a new prompt.You can submit a pull request (PR) for the source code and prompts at the same time. It becomes simpler to review what the change is, and the downstream dependencies of what these prompts will touch/interact with.You can add checks to your CI/CD system to ensure bad prompts don’t make it to production.It’s simpler to debug an issue. You just pull the (Docker) container that was created and you’ll be able to exactly replicate any customer issue quickly and easily.There is no other “prompt system” to maintain or manage. Simplifying operations.It doesn’t preclude adding extra monitoring and visibility.How it would work with HamiltonThe prompts would be encoded into functions into the dataflow/directed acyclic graph (DAG):What summarization.py in the PDF summarizer example looks like. The prompt templates are part of the code. Diagram created via Hamilton.Pairing this code withgit, we have a lightweight versioning system for your entire dataflow (i.e. “chain”), so you can always discern what state the world was in, given a git commit SHA. If you want to manage and have access to multiple prompts at any given point in time, Hamilton has two powerful abstractions to enable you to do so:@config.whenandPython modules. This allows you to store and keep available all older prompt versions together and specify which one to use via code.@config.when (docs)Hamilton has a concept of decorators, which are just annotations on functions. 
The@config.whendecorator allows to specify alternative implementations for a functions, i.e. “node”, in your dataflow. In this case, we specify alternative prompts.from hamilton.function_modifiers import config\\n\\n@config.when(version=\"v1\")\\ndef summarize_chunk_of_text_prompt__v1() -> str:\\n    \"\"\"V1 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text. Extract any key points with reasoning.\\\\n\\\\nContent:\"\\n\\n@config.when(version=\"v2\")\\ndef summarize_chunk_of_text_prompt__v2(content_type: str = \"an academic paper\") -> str:\\n    \"\"\"V2 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text from {content_type}. Extract the key points with reasoning. \\\\n\\\\nContent:\"You can keep adding functions annotated with@config.when, allowing you to swap between them using configuration passed to the HamiltonDriver. When instantiating theDriver,  it will construct the dataflow using the prompt implementation associated with the configuration value.from hamilton import base, driver\\nimport summarization\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(summarization)\\n    .with_config({\"version\": \"v1\"}) # V1 is chosen. Use \"v2\\' for V2.\\n    .build()\\n)Module switchingAlternatively to using@config.when, you can instead place your different prompt implementations into different Python modules. Then, atDriverconstruction time, pass the correct module for the context you want to use.So here we have one module housing V1 of our prompt:# prompts_v1.py\\ndef summarize_chunk_of_text_prompt() -> str:\\n    \"\"\"V1 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text. 
Extract any key points with reasoning.\\\\n\\\\nContent:\"Here we have one module housing V2 (see how they differ slightly):# prompts_v2.py\\ndef summarize_chunk_of_text_prompt(content_type: str = \"an academic paper\") -> str:\\n    \"\"\"V2 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text from {content_type}. Extract the key points with reasoning. \\\\n\\\\nContent:\"In the driver code below, we choose the right module to use based on some context.# run.py\\nfrom hamilton import driver\\nimport summarization\\nimport prompts_v1\\nimport prompts_v2\\n\\n# create driver -- passing in the right module we want\\ndr = (\\n    driver.Builder()\\n    .with_modules(\\n        prompts_v1,  # or prompts_v2\\n        summarization,\\n    )\\n    .build()\\n)Using the module approach allows us to encapsulate and version whole sets of prompts together. If you want to go back in time (via git), or see what a blessed prompt version was, you just need to navigate to the correct commit, and then look in the right module.How would I log prompts used and monitor flows?Assuming you’re using git to track your code, you wouldn’t need to record what prompts were being used. Instead, you’d just need to know what git commit SHA is deployed and you’ll be able to track the version of your code and prompts simultaneously.To monitor flows, just like the above approach, you have the same monitoring hooks available at your disposal, and I wont repeat them here, but they are:Request any intermediate outputs and log them yourself outside of Hamilton.Log them from within the function yourself, or build aPython decorator/GraphAdapterto do it at the framework level.Integrate 3rd party tooling for monitoring your code and LLM API calls, or use the DAGWorks Platform offering to monitor it all. (Try the free tierhere)!or all the above!What about A/B testing my prompts?With any ML initiative, it’s important to measure business impacts of changes. 
Likewise, with LLMs + prompts, it’ll be important to test and measure changes against important business metrics. In the MLOps world, you’d be A/B testing ML models to evaluate their business value by dividing traffic between them. To ensure the randomness necessary to A/B tests, you wouldn’t know at runtime which model to use until a coin is flipped. However, to get those models out, they both would have follow a process to qualify them. So for prompts, we should think similarly.The above two prompt engineering patterns don’t preclude you from being able to A/B test prompts, but it means you need to manage a process to enable however many prompt templates you’re testing in parallel. If you’re also adjusting code paths, having them in code will be simpler to discern and debug what is going on, and you can make use of the `@config.when` decorator / python module swapping for this purpose. Versus, having to critically rely on your logging/monitoring/observability stack to tell you what prompt was used if you’re dynamically loading/passing them in and then having to mentally map which prompts go with which code paths.Note, this all gets harder if you start needing to change multiple prompts for an A/B test because you have several of them in a flow. For example you have two prompts in your workflow and you’re changing LLMs, you’ll want to A/B test the change holistically, rather than individually per prompt. Our advice, by putting the prompts into code your operational life will be simpler, since you’ll know what two prompts belong to what code paths without having to do any mental mapping.Thank you for reading DAGWorks’s Substack. This post is public so feel free to share it.ShareSummaryIn this post, we explained two patterns for managing prompts in a production environment with Hamilton. The first approach treatsprompts asdynamic runtime variables,while the second, treatsprompts as codefor production settings. 
If you value reducing operational burden, then our advice is to encode prompts as code, as it is operationally simpler, unless the speed to change them really matters for you.To recap:Prompts as dynamic runtime variables. Use an external system to pass the prompts to your Hamilton dataflows, or use Hamilton to pull them from a DB. For debugging & monitoring, it’s important to be able to determine what prompt was used for a given invocation. You can integrate open source tools, or use something like the DAGWorks Platform to help ensure you know what was used for any invocation of your code.Prompts as code.Encoding the prompts as code allows easy versioning with git. Change management can be done via pull requests and CI/CD checks. It works well with Hamilton’s features like@config.whenand module switching at the Driver level because it determines clearly what version of the prompt is used. This approach strengthens the use of any tooling you might use to monitor or track, like the DAGWorks Platform, as prompts for a deployment are immutable.We want to hear from you!If you’re excited by any of this, or have strong opinions, leave a comment, or drop by our Slack channel! Some links to do praise/complain/chat:📣join our community on Slack—\\u200awe’re more than happy to help answer questions you might have or get you started.⭐️ us onGitHub.📝 leave us anissueif you find something.📚 read ourdocumentation.⌨️ interactivelylearn about Hamilton in your browser.Other Hamilton posts you might be interested in:We have a growing collection of posts & content. Here are some we think you might be interested in.Containerized PDF Summarizer with FastAPI and HamiltonThierry Jean,DAGWorks Inc., andStefan Krawczyk·Aug 18Skip learning convoluted LLM-specific frameworks and write your first LLM application using regular Python functions and Hamilton! In this post, we’ll present a containerized PDF summarizer powered by the OpenAI API. 
Its flow is encoded in Hamilton, which theRead full storyBuilding a maintainable and modular LLM application stack with HamiltonThierry JeanandDAGWorks Inc.·Jul 11In this post, we’re going to share how Hamilton can help you write modular and maintainable code for your large language model (LLM) application stack. Hamilton is great for describing any type of dataflow, which is exactly what you’re doing when building an LLM powered application. With Hamilton you get strongRead full storytryhamilton.dev– an interactive tutorial in your browser!Hamilton + Airflow(GitHub repo)Hamilton + Feast(GitHub repo)Pandas data transformations in Hamilton in 5 minutesLineage + Hamilton in 10 minutes4Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherShare What you send to your LLM is quite important. Small variations and changes can have large impacts on outputs, so as your product evolves, the need to evolve your prompts will too. LLMs are also constantly being developed and released, and so as LLMs change, your prompts will also need to change. Therefore it’s important to set up an iteration pattern to operationalize how you “deploy” your prompts so you and your team can move efficiently, but also ensure that production issues are minimized, if not avoided. In this post, we’ll guide you through the best practices of managing prompts with Hamilton, making analogies to MLOps patterns, and discussing trade-offs along the way.Notes:(1): if you’re looking for a post that talks about “context management” this isn’t that post. 
But it is the post that will help you with the nuts and bolts on how to iterate and create that production grade “prompt context management” iteration story.(2): we’ll use prompt & prompt template interchangeably.(3): we’ll assume an “online” web-service setting is where these prompts are being used.(4): we’ll be using ourHamilton’s PDF summarizer exampleto project our patterns onto.(5): not familiar withHamilton? You can either learn about Hamilton viaTry Hamiltonand come back, or get the high level LLMOps approach from this post and then dig into Hamilton via thePDF Summarizer example.(6): what’s our credibility here? We’ve spent our careers building self-service data/MLOps tooling, most famously for Stitch Fix’s 100+ Data Scientists. So we’ve seen our share of outages and approaches play out over time.Thanks for reading DAGWorks’s Substack! Subscribe for free to receive updates and posts like this!SubscribePrompts are to LLMs what hyper-parameters are to ML modelsPoint:Prompts + LLM APIs are analogous to hyper-parameters + machine learning models.In terms of “Ops” practices, LLMOps is still in its infancy. MLOps is a little older, but still neither are widely adopted if you’re comparing it to how widespread knowledge is around DevOps practices.DevOps practices largely concern themselves with how you ship code to production, and MLOps practices how to ship code& data artifacts(e.g., statistical models)to production. So what about LLMOps? Personally, I think it’s closer to MLOps since you have:your LLM workflow is simply code.and an LLM API is a data artifact that can be “tweaked” using prompts, similar to a machine learning (ML) model and its hyper-parameters.Therefore, you most likely care about versioning the LLM API + prompts together tightly for good production practices. 
For instance, in MLOps practice,  you’d want a process in place to validate your ML model still behaves correctly whenever its hyper-parameters are changed.How should you think about operationalizing a prompt?To be clear, the two parts to control for are theLLMand theprompts. Much like  MLOps, when the code or the model artifact changes, you want to be able to determine which did. For LLMOps, we’ll want the same discernment, separating the LLM workflow from the LLM API + prompts. Importantly, we should consider LLMs (self-hosted or APIs) to be mostly static since we less frequently update (or even control) their internals. So, changing thepromptspart of LLM API + prompts is effectively like creating a new model artifact.There are two main ways to treat prompts:Prompts as dynamic runtime variables. The template used isn’t static to a deployment.Prompts as code.The prompt template is static/ predetermined given a deployment.The main difference is the amount of moving parts you need to manage to ensure a great production story. Below, we dig into how to use Hamilton in the context of these two approaches.Prompts as dynamic runtime variablesDynamically Pass/Load PromptsPrompts are just strings. 
Since strings are a primitive type in most languages, this means that they are quite easy to pass around.\\xa0 The idea is to abstract your code so that at runtime you pass in the prompts required.\\xa0 More concretely, you’d “load/reload” prompt templates whenever there’s an “updated” one.The MLOps analogy here, would be to auto-reload the ML model artifact (e.g., a pkl file) whenever a new model is available.MLOps Analogy: diagram showing how ML model auto reloading would look.Diagram showing what dynamically reloading/querying prompts would look like.The benefit here is that you can very quickly roll out new prompts because you do not need to redeploy your application!The downside to this iteration speed is increased operational burden:To someone monitoring your application, it’ll be unclear when the change occurred and whether it’s propagated itself through your systems. For example, you just pushed a new prompt, and the LLM now returns more tokens per request, causing latency to spike; whoever is monitoring will likely be puzzled, unless you have a great change log culture.Rollback semantics involve having to know aboutanothersystem. You can’t just rollback a prior deployment to fix things.You’ll need great monitoring to understand what was run and when; e.g., when customer service gives you a ticket to investigate, how do you know what prompt was in use?You’ll need to manage and monitor whatever system you’re using to manage and store your prompts. This will be an extra system you’ll need to maintain outside of whatever is serving your code.You’ll need to manage two processes, one for updating and pushing the service, and one for updating and pushing prompts. Synchronizing these changes will be on you. For example, you need to make a code change to your service to handle a new prompt. 
You will need to coordinate changing two systems to make it work, which is extra operational overhead to manage.How it would work with HamiltonOur PDF summarizer flow would look something like this if you removesummarize_text_from_summaries_promptandsummarize_chunk_of_text_promptfunction definitions:summarization_shortened.py. Note the two inputs “*_prompt” that denote prompts that are now required as input to the dataflow to function. With Hamilton you’ll be able to determine what inputs should be required for your prompt template by just looking at a diagram like this. Diagram created via Hamilton.To operate things, you’ll want to either inject the prompts at request time:from hamilton import base, driver\\nimport summarization_shortend\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(summarization_sortened)\\n    .build()\\n)\\n\\n# pull prompts from somewhere\\nsummarize_chunk_of_text_prompt = \"\"\"SOME PROMPT FOR {chunked_text}\"\"\"\\nsummarize_text_from_summaries_prompt = \"\"\"SOME PROMPT {summarized_chunks} ... {user_query}\"\"\"\\n\\n# execute, and pass in the prompt \\nresult = dr.execute(\\n   [\"summarized_text\"],\\n   inputs={\\n      \"summarize_chunk_of_text_prompt\": summarize_chunk_of_text_prompt,\\n      ...\\n   }\\n)Or\\xa0you change your code to dynamically load prompts, i.e., add functions to retrieve prompts from an external system as part of the Hamilton dataflow. 
At each invocation, they will query for the prompt to use (you can of course cache this for performance):# prompt_template_loaders.py\\n\\ndef summarize_chunk_of_text_prompt(\\n  db_client: Client, other_args: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query( \\n         \"get latest prompt X from DB\", other_args)\\n    return _prompt\\n\\ndef summarize_text_from_summaries_prompt(\\n   db_client: Client, another_arg: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query(\\n         \"get latest prompt Y from DB\", another_arg)\\n    return _promptDriver code:from hamilton import base, driver\\nimport prompt_template_loaders # <-- load this to provide prompt input\\nimport summarization_shortend\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(\\n        prompt_template_loaders,# <-- Hamilton will call above functions\\n        summarization_sortened, \\n    )\\n    .build()\\n)\\n\\n# execute, and pass in the prompt \\nresult = dr.execute(\\n   [\"summarized_text\"],\\n   inputs={\\n      # don\\'t need to pass prompts in this version\\n   }\\n)How would I log prompts used and monitor flows?Here we outline a few ways to monitor what went on.Log results of execution. That is run Hamilton, then emit information to wherever you want it to go.result = dr.execute(\\n   [\"summarized_text\", \\n    \"summarize_chunk_of_text_prompt\",   \\n    ... # and anything else you want to pull out\\n    \"summarize_text_from_summaries_prompt\"],\\n   inputs={\\n      # don\\'t need to pass prompts in this version\\n   }\\n)\\n\\nmy_log_system(result) # send what you want for safe keeping to some\\n                      # system that you own.Note. In the above, Hamilton allows you to requestanyintermediateoutputs simply by requesting “functions” (i.e. nodes in the diagram) by name. 
If we really want to get all the intermediate outputs of the entire dataflow, we can do so and log it wherever we want to!Use loggers inside Hamilton functions (to see the power of this approach,see my old talk on structured logs):import logging\\n\\nlogger = logging.getLogger(__name__)\\n\\ndef summarize_text_from_summaries_prompt(\\n    db_client: Client, another_arg: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query(\\n         \"get latest prompt Y from DB\", another_arg)\\n    logger.info(f\"Prompt used is [{_prompt}]\")\\n    return _promptExtend Hamilton to emit this information. You use Hamilton to capture information from executed functions, i.e. nodes, without needing to insert logging statement inside the function’s body. This promotes reusability since you can toggle logging between development and production settings at the Driver level. SeeGraphAdapters, or write your ownPython decoratorto wrap functions for monitoring.In any of the above code, you could easily pull in a 3rd party tool to help track & monitor the code, as well as the external API call, e.g. data dog. Note, with a one-line code change, you can plug in the DAGWorks’s Driver and get all that monitoring you’d want and more. (Try the free tierhere)!Prompts as codePrompts as static stringsSince prompts are simply strings, they’re also very amenable to being stored along with your source code. The idea is to store as many prompt versions as you like, within your code so that at runtime, the set of prompts available is fixed and deterministic.The MLOps analogy here is, instead of dynamically reloading models, you instead bake the ML model into the container/hard code the reference. Once deployed, your app has everything that it needs. The deployment is immutable; nothing changes once it’s up. 
This makes debugging & determining what’s going on, much simpler.MLOps Analogy: make an immutable deployment by making the model fixed for your app’s deployment.Diagram showing how treating prompts as code enables you to leverage your CI/CD and build an immutable deployment for talking to your LLM API.This approach has many operational benefits:Whenever a new prompt is pushed, it forces a new deployment. Rollback semantics are clear if there’s an issue with a new prompt.You can submit a pull request (PR) for the source code and prompts at the same time. It becomes simpler to review what the change is, and the downstream dependencies of what these prompts will touch/interact with.You can add checks to your CI/CD system to ensure bad prompts don’t make it to production.It’s simpler to debug an issue. You just pull the (Docker) container that was created and you’ll be able to exactly replicate any customer issue quickly and easily.There is no other “prompt system” to maintain or manage. Simplifying operations.It doesn’t preclude adding extra monitoring and visibility.How it would work with HamiltonThe prompts would be encoded into functions into the dataflow/directed acyclic graph (DAG):What summarization.py in the PDF summarizer example looks like. The prompt templates are part of the code. Diagram created via Hamilton.Pairing this code withgit, we have a lightweight versioning system for your entire dataflow (i.e. “chain”), so you can always discern what state the world was in, given a git commit SHA. If you want to manage and have access to multiple prompts at any given point in time, Hamilton has two powerful abstractions to enable you to do so:@config.whenandPython modules. This allows you to store and keep available all older prompt versions together and specify which one to use via code.@config.when (docs)Hamilton has a concept of decorators, which are just annotations on functions. 
The@config.whendecorator allows to specify alternative implementations for a functions, i.e. “node”, in your dataflow. In this case, we specify alternative prompts.from hamilton.function_modifiers import config\\n\\n@config.when(version=\"v1\")\\ndef summarize_chunk_of_text_prompt__v1() -> str:\\n    \"\"\"V1 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text. Extract any key points with reasoning.\\\\n\\\\nContent:\"\\n\\n@config.when(version=\"v2\")\\ndef summarize_chunk_of_text_prompt__v2(content_type: str = \"an academic paper\") -> str:\\n    \"\"\"V2 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text from {content_type}. Extract the key points with reasoning. \\\\n\\\\nContent:\"You can keep adding functions annotated with@config.when, allowing you to swap between them using configuration passed to the HamiltonDriver. When instantiating theDriver,  it will construct the dataflow using the prompt implementation associated with the configuration value.from hamilton import base, driver\\nimport summarization\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(summarization)\\n    .with_config({\"version\": \"v1\"}) # V1 is chosen. Use \"v2\\' for V2.\\n    .build()\\n)Module switchingAlternatively to using@config.when, you can instead place your different prompt implementations into different Python modules. Then, atDriverconstruction time, pass the correct module for the context you want to use.So here we have one module housing V1 of our prompt:# prompts_v1.py\\ndef summarize_chunk_of_text_prompt() -> str:\\n    \"\"\"V1 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text. 
Extract any key points with reasoning.\\\\n\\\\nContent:\"Here we have one module housing V2 (see how they differ slightly):# prompts_v2.py\\ndef summarize_chunk_of_text_prompt(content_type: str = \"an academic paper\") -> str:\\n    \"\"\"V2 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text from {content_type}. Extract the key points with reasoning. \\\\n\\\\nContent:\"In the driver code below, we choose the right module to use based on some context.# run.py\\nfrom hamilton import driver\\nimport summarization\\nimport prompts_v1\\nimport prompts_v2\\n\\n# create driver -- passing in the right module we want\\ndr = (\\n    driver.Builder()\\n    .with_modules(\\n        prompts_v1,  # or prompts_v2\\n        summarization,\\n    )\\n    .build()\\n)Using the module approach allows us to encapsulate and version whole sets of prompts together. If you want to go back in time (via git), or see what a blessed prompt version was, you just need to navigate to the correct commit, and then look in the right module.How would I log prompts used and monitor flows?Assuming you’re using git to track your code, you wouldn’t need to record what prompts were being used. Instead, you’d just need to know what git commit SHA is deployed and you’ll be able to track the version of your code and prompts simultaneously.To monitor flows, just like the above approach, you have the same monitoring hooks available at your disposal, and I wont repeat them here, but they are:Request any intermediate outputs and log them yourself outside of Hamilton.Log them from within the function yourself, or build aPython decorator/GraphAdapterto do it at the framework level.Integrate 3rd party tooling for monitoring your code and LLM API calls, or use the DAGWorks Platform offering to monitor it all. (Try the free tierhere)!or all the above!What about A/B testing my prompts?With any ML initiative, it’s important to measure business impacts of changes. 
Likewise, with LLMs + prompts, it’ll be important to test and measure changes against important business metrics. In the MLOps world, you’d be A/B testing ML models to evaluate their business value by dividing traffic between them. To ensure the randomness necessary to A/B tests, you wouldn’t know at runtime which model to use until a coin is flipped. However, to get those models out, they both would have to follow a process to qualify them. So for prompts, we should think similarly. The above two prompt engineering patterns don’t preclude you from being able to A/B test prompts, but it means you need to manage a process to enable however many prompt templates you’re testing in parallel. If you’re also adjusting code paths, having them in code will be simpler to discern and debug what is going on, and you can make use of the `@config.when` decorator / python module swapping for this purpose. Versus, having to critically rely on your logging/monitoring/observability stack to tell you what prompt was used if you’re dynamically loading/passing them in and then having to mentally map which prompts go with which code paths. Note, this all gets harder if you start needing to change multiple prompts for an A/B test because you have several of them in a flow. For example, if you have two prompts in your workflow and you’re changing LLMs, you’ll want to A/B test the change holistically, rather than individually per prompt. Our advice: by putting the prompts into code, your operational life will be simpler, since you’ll know what two prompts belong to what code paths without having to do any mental mapping. Thank you for reading DAGWorks’s Substack. This post is public so feel free to share it. Share. Summary: In this post, we explained two patterns for managing prompts in a production environment with Hamilton. The first approach treats prompts as dynamic runtime variables, while the second treats prompts as code for production settings. 
If you value reducing operational burden, then our advice is to encode prompts as code, as it is operationally simpler, unless the speed to change them really matters for you.To recap:Prompts as dynamic runtime variables. Use an external system to pass the prompts to your Hamilton dataflows, or use Hamilton to pull them from a DB. For debugging & monitoring, it’s important to be able to determine what prompt was used for a given invocation. You can integrate open source tools, or use something like the DAGWorks Platform to help ensure you know what was used for any invocation of your code.Prompts as code.Encoding the prompts as code allows easy versioning with git. Change management can be done via pull requests and CI/CD checks. It works well with Hamilton’s features like@config.whenand module switching at the Driver level because it determines clearly what version of the prompt is used. This approach strengthens the use of any tooling you might use to monitor or track, like the DAGWorks Platform, as prompts for a deployment are immutable.We want to hear from you!If you’re excited by any of this, or have strong opinions, leave a comment, or drop by our Slack channel! Some links to do praise/complain/chat:📣join our community on Slack—\\u200awe’re more than happy to help answer questions you might have or get you started.⭐️ us onGitHub.📝 leave us anissueif you find something.📚 read ourdocumentation.⌨️ interactivelylearn about Hamilton in your browser.Other Hamilton posts you might be interested in:We have a growing collection of posts & content. Here are some we think you might be interested in.Containerized PDF Summarizer with FastAPI and HamiltonThierry Jean,DAGWorks Inc., andStefan Krawczyk·Aug 18Skip learning convoluted LLM-specific frameworks and write your first LLM application using regular Python functions and Hamilton! In this post, we’ll present a containerized PDF summarizer powered by the OpenAI API. 
Its flow is encoded in Hamilton, which theRead full storyBuilding a maintainable and modular LLM application stack with HamiltonThierry JeanandDAGWorks Inc.·Jul 11In this post, we’re going to share how Hamilton can help you write modular and maintainable code for your large language model (LLM) application stack. Hamilton is great for describing any type of dataflow, which is exactly what you’re doing when building an LLM powered application. With Hamilton you get strongRead full storytryhamilton.dev– an interactive tutorial in your browser!Hamilton + Airflow(GitHub repo)Hamilton + Feast(GitHub repo)Pandas data transformations in Hamilton in 5 minutesLineage + Hamilton in 10 minutes What you send to your LLM is quite important. Small variations and changes can have large impacts on outputs, so as your product evolves, the need to evolve your prompts will too. LLMs are also constantly being developed and released, and so as LLMs change, your prompts will also need to change. Therefore it’s important to set up an iteration pattern to operationalize how you “deploy” your prompts so you and your team can move efficiently, but also ensure that production issues are minimized, if not avoided. In this post, we’ll guide you through the best practices of managing prompts with Hamilton, making analogies to MLOps patterns, and discussing trade-offs along the way.Notes:(1): if you’re looking for a post that talks about “context management” this isn’t that post. But it is the post that will help you with the nuts and bolts on how to iterate and create that production grade “prompt context management” iteration story.(2): we’ll use prompt & prompt template interchangeably.(3): we’ll assume an “online” web-service setting is where these prompts are being used.(4): we’ll be using ourHamilton’s PDF summarizer exampleto project our patterns onto.(5): not familiar withHamilton? 
You can either learn about Hamilton viaTry Hamiltonand come back, or get the high level LLMOps approach from this post and then dig into Hamilton via thePDF Summarizer example.(6): what’s our credibility here? We’ve spent our careers building self-service data/MLOps tooling, most famously for Stitch Fix’s 100+ Data Scientists. So we’ve seen our share of outages and approaches play out over time.Thanks for reading DAGWorks’s Substack! Subscribe for free to receive updates and posts like this!SubscribePrompts are to LLMs what hyper-parameters are to ML modelsPoint:Prompts + LLM APIs are analogous to hyper-parameters + machine learning models.In terms of “Ops” practices, LLMOps is still in its infancy. MLOps is a little older, but still neither are widely adopted if you’re comparing it to how widespread knowledge is around DevOps practices.DevOps practices largely concern themselves with how you ship code to production, and MLOps practices how to ship code& data artifacts(e.g., statistical models)to production. So what about LLMOps? Personally, I think it’s closer to MLOps since you have:your LLM workflow is simply code.and an LLM API is a data artifact that can be “tweaked” using prompts, similar to a machine learning (ML) model and its hyper-parameters.Therefore, you most likely care about versioning the LLM API + prompts together tightly for good production practices. For instance, in MLOps practice,  you’d want a process in place to validate your ML model still behaves correctly whenever its hyper-parameters are changed.How should you think about operationalizing a prompt?To be clear, the two parts to control for are theLLMand theprompts. Much like  MLOps, when the code or the model artifact changes, you want to be able to determine which did. For LLMOps, we’ll want the same discernment, separating the LLM workflow from the LLM API + prompts. 
Importantly, we should consider LLMs (self-hosted or APIs) to be mostly static since we less frequently update (or even control) their internals. So, changing thepromptspart of LLM API + prompts is effectively like creating a new model artifact.There are two main ways to treat prompts:Prompts as dynamic runtime variables. The template used isn’t static to a deployment.Prompts as code.The prompt template is static/ predetermined given a deployment.The main difference is the amount of moving parts you need to manage to ensure a great production story. Below, we dig into how to use Hamilton in the context of these two approaches.Prompts as dynamic runtime variablesDynamically Pass/Load PromptsPrompts are just strings. Since strings are a primitive type in most languages, this means that they are quite easy to pass around.\\xa0 The idea is to abstract your code so that at runtime you pass in the prompts required.\\xa0 More concretely, you’d “load/reload” prompt templates whenever there’s an “updated” one.The MLOps analogy here, would be to auto-reload the ML model artifact (e.g., a pkl file) whenever a new model is available.MLOps Analogy: diagram showing how ML model auto reloading would look.Diagram showing what dynamically reloading/querying prompts would look like.The benefit here is that you can very quickly roll out new prompts because you do not need to redeploy your application!The downside to this iteration speed is increased operational burden:To someone monitoring your application, it’ll be unclear when the change occurred and whether it’s propagated itself through your systems. For example, you just pushed a new prompt, and the LLM now returns more tokens per request, causing latency to spike; whoever is monitoring will likely be puzzled, unless you have a great change log culture.Rollback semantics involve having to know aboutanothersystem. 
You can’t just rollback a prior deployment to fix things.You’ll need great monitoring to understand what was run and when; e.g., when customer service gives you a ticket to investigate, how do you know what prompt was in use?You’ll need to manage and monitor whatever system you’re using to manage and store your prompts. This will be an extra system you’ll need to maintain outside of whatever is serving your code.You’ll need to manage two processes, one for updating and pushing the service, and one for updating and pushing prompts. Synchronizing these changes will be on you. For example, you need to make a code change to your service to handle a new prompt. You will need to coordinate changing two systems to make it work, which is extra operational overhead to manage.How it would work with HamiltonOur PDF summarizer flow would look something like this if you removesummarize_text_from_summaries_promptandsummarize_chunk_of_text_promptfunction definitions:summarization_shortened.py. Note the two inputs “*_prompt” that denote prompts that are now required as input to the dataflow to function. With Hamilton you’ll be able to determine what inputs should be required for your prompt template by just looking at a diagram like this. Diagram created via Hamilton.To operate things, you’ll want to either inject the prompts at request time:from hamilton import base, driver\\nimport summarization_shortend\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(summarization_sortened)\\n    .build()\\n)\\n\\n# pull prompts from somewhere\\nsummarize_chunk_of_text_prompt = \"\"\"SOME PROMPT FOR {chunked_text}\"\"\"\\nsummarize_text_from_summaries_prompt = \"\"\"SOME PROMPT {summarized_chunks} ... 
{user_query}\"\"\"\\n\\n# execute, and pass in the prompt \\nresult = dr.execute(\\n   [\"summarized_text\"],\\n   inputs={\\n      \"summarize_chunk_of_text_prompt\": summarize_chunk_of_text_prompt,\\n      ...\\n   }\\n)Or\\xa0you change your code to dynamically load prompts, i.e., add functions to retrieve prompts from an external system as part of the Hamilton dataflow. At each invocation, they will query for the prompt to use (you can of course cache this for performance):# prompt_template_loaders.py\\n\\ndef summarize_chunk_of_text_prompt(\\n  db_client: Client, other_args: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query( \\n         \"get latest prompt X from DB\", other_args)\\n    return _prompt\\n\\ndef summarize_text_from_summaries_prompt(\\n   db_client: Client, another_arg: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query(\\n         \"get latest prompt Y from DB\", another_arg)\\n    return _promptDriver code:from hamilton import base, driver\\nimport prompt_template_loaders # <-- load this to provide prompt input\\nimport summarization_shortend\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(\\n        prompt_template_loaders,# <-- Hamilton will call above functions\\n        summarization_sortened, \\n    )\\n    .build()\\n)\\n\\n# execute, and pass in the prompt \\nresult = dr.execute(\\n   [\"summarized_text\"],\\n   inputs={\\n      # don\\'t need to pass prompts in this version\\n   }\\n)How would I log prompts used and monitor flows?Here we outline a few ways to monitor what went on.Log results of execution. That is run Hamilton, then emit information to wherever you want it to go.result = dr.execute(\\n   [\"summarized_text\", \\n    \"summarize_chunk_of_text_prompt\",   \\n    ... 
# and anything else you want to pull out\\n    \"summarize_text_from_summaries_prompt\"],\\n   inputs={\\n      # don\\'t need to pass prompts in this version\\n   }\\n)\\n\\nmy_log_system(result) # send what you want for safe keeping to some\\n                      # system that you own.Note. In the above, Hamilton allows you to requestanyintermediateoutputs simply by requesting “functions” (i.e. nodes in the diagram) by name. If we really want to get all the intermediate outputs of the entire dataflow, we can do so and log it wherever we want to!Use loggers inside Hamilton functions (to see the power of this approach,see my old talk on structured logs):import logging\\n\\nlogger = logging.getLogger(__name__)\\n\\ndef summarize_text_from_summaries_prompt(\\n    db_client: Client, another_arg: str) -> str:\\n    # pseudo code here, but you get the idea:\\n    _prompt = db_client.query(\\n         \"get latest prompt Y from DB\", another_arg)\\n    logger.info(f\"Prompt used is [{_prompt}]\")\\n    return _promptExtend Hamilton to emit this information. You use Hamilton to capture information from executed functions, i.e. nodes, without needing to insert logging statement inside the function’s body. This promotes reusability since you can toggle logging between development and production settings at the Driver level. SeeGraphAdapters, or write your ownPython decoratorto wrap functions for monitoring.In any of the above code, you could easily pull in a 3rd party tool to help track & monitor the code, as well as the external API call, e.g. data dog. Note, with a one-line code change, you can plug in the DAGWorks’s Driver and get all that monitoring you’d want and more. (Try the free tierhere)!Prompts as codePrompts as static stringsSince prompts are simply strings, they’re also very amenable to being stored along with your source code. 
The idea is to store as many prompt versions as you like, within your code so that at runtime, the set of prompts available is fixed and deterministic.The MLOps analogy here is, instead of dynamically reloading models, you instead bake the ML model into the container/hard code the reference. Once deployed, your app has everything that it needs. The deployment is immutable; nothing changes once it’s up. This makes debugging & determining what’s going on, much simpler.MLOps Analogy: make an immutable deployment by making the model fixed for your app’s deployment.Diagram showing how treating prompts as code enables you to leverage your CI/CD and build an immutable deployment for talking to your LLM API.This approach has many operational benefits:Whenever a new prompt is pushed, it forces a new deployment. Rollback semantics are clear if there’s an issue with a new prompt.You can submit a pull request (PR) for the source code and prompts at the same time. It becomes simpler to review what the change is, and the downstream dependencies of what these prompts will touch/interact with.You can add checks to your CI/CD system to ensure bad prompts don’t make it to production.It’s simpler to debug an issue. You just pull the (Docker) container that was created and you’ll be able to exactly replicate any customer issue quickly and easily.There is no other “prompt system” to maintain or manage. Simplifying operations.It doesn’t preclude adding extra monitoring and visibility.How it would work with HamiltonThe prompts would be encoded into functions into the dataflow/directed acyclic graph (DAG):What summarization.py in the PDF summarizer example looks like. The prompt templates are part of the code. Diagram created via Hamilton.Pairing this code withgit, we have a lightweight versioning system for your entire dataflow (i.e. “chain”), so you can always discern what state the world was in, given a git commit SHA. 
If you want to manage and have access to multiple prompts at any given point in time, Hamilton has two powerful abstractions to enable you to do so:@config.whenandPython modules. This allows you to store and keep available all older prompt versions together and specify which one to use via code.@config.when (docs)Hamilton has a concept of decorators, which are just annotations on functions. The@config.whendecorator allows to specify alternative implementations for a functions, i.e. “node”, in your dataflow. In this case, we specify alternative prompts.from hamilton.function_modifiers import config\\n\\n@config.when(version=\"v1\")\\ndef summarize_chunk_of_text_prompt__v1() -> str:\\n    \"\"\"V1 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text. Extract any key points with reasoning.\\\\n\\\\nContent:\"\\n\\n@config.when(version=\"v2\")\\ndef summarize_chunk_of_text_prompt__v2(content_type: str = \"an academic paper\") -> str:\\n    \"\"\"V2 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text from {content_type}. Extract the key points with reasoning. \\\\n\\\\nContent:\"You can keep adding functions annotated with@config.when, allowing you to swap between them using configuration passed to the HamiltonDriver. When instantiating theDriver,  it will construct the dataflow using the prompt implementation associated with the configuration value.from hamilton import base, driver\\nimport summarization\\n\\n# create driver\\ndr = (\\n    driver.Builder()\\n    .with_modules(summarization)\\n    .with_config({\"version\": \"v1\"}) # V1 is chosen. Use \"v2\\' for V2.\\n    .build()\\n)Module switchingAlternatively to using@config.when, you can instead place your different prompt implementations into different Python modules. 
Then, atDriverconstruction time, pass the correct module for the context you want to use.So here we have one module housing V1 of our prompt:# prompts_v1.py\\ndef summarize_chunk_of_text_prompt() -> str:\\n    \"\"\"V1 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text. Extract any key points with reasoning.\\\\n\\\\nContent:\"Here we have one module housing V2 (see how they differ slightly):# prompts_v2.py\\ndef summarize_chunk_of_text_prompt(content_type: str = \"an academic paper\") -> str:\\n    \"\"\"V2 prompt for summarizing chunks of text.\"\"\"\\n    return f\"Summarize this text from {content_type}. Extract the key points with reasoning. \\\\n\\\\nContent:\"In the driver code below, we choose the right module to use based on some context.# run.py\\nfrom hamilton import driver\\nimport summarization\\nimport prompts_v1\\nimport prompts_v2\\n\\n# create driver -- passing in the right module we want\\ndr = (\\n    driver.Builder()\\n    .with_modules(\\n        prompts_v1,  # or prompts_v2\\n        summarization,\\n    )\\n    .build()\\n)Using the module approach allows us to encapsulate and version whole sets of prompts together. If you want to go back in time (via git), or see what a blessed prompt version was, you just need to navigate to the correct commit, and then look in the right module.How would I log prompts used and monitor flows?Assuming you’re using git to track your code, you wouldn’t need to record what prompts were being used. 
Instead, you’d just need to know what git commit SHA is deployed and you’ll be able to track the version of your code and prompts simultaneously. To monitor flows, just like the above approach, you have the same monitoring hooks available at your disposal, and I won’t repeat them here, but they are: Request any intermediate outputs and log them yourself outside of Hamilton. Log them from within the function yourself, or build a Python decorator/GraphAdapter to do it at the framework level. Integrate 3rd party tooling for monitoring your code and LLM API calls, or use the DAGWorks Platform offering to monitor it all. (Try the free tier here)! Or all the above! What about A/B testing my prompts? With any ML initiative, it’s important to measure business impacts of changes. Likewise, with LLMs + prompts, it’ll be important to test and measure changes against important business metrics. In the MLOps world, you’d be A/B testing ML models to evaluate their business value by dividing traffic between them. To ensure the randomness necessary to A/B tests, you wouldn’t know at runtime which model to use until a coin is flipped. However, to get those models out, they both would have to follow a process to qualify them. So for prompts, we should think similarly. The above two prompt engineering patterns don’t preclude you from being able to A/B test prompts, but it means you need to manage a process to enable however many prompt templates you’re testing in parallel. If you’re also adjusting code paths, having them in code will be simpler to discern and debug what is going on, and you can make use of the `@config.when` decorator / python module swapping for this purpose. 
Versus, having to critically rely on your logging/monitoring/observability stack to tell you what prompt was used if you’re dynamically loading/passing them in and then having to mentally map which prompts go with which code paths.Note, this all gets harder if you start needing to change multiple prompts for an A/B test because you have several of them in a flow. For example you have two prompts in your workflow and you’re changing LLMs, you’ll want to A/B test the change holistically, rather than individually per prompt. Our advice, by putting the prompts into code your operational life will be simpler, since you’ll know what two prompts belong to what code paths without having to do any mental mapping.Thank you for reading DAGWorks’s Substack. This post is public so feel free to share it.ShareSummaryIn this post, we explained two patterns for managing prompts in a production environment with Hamilton. The first approach treatsprompts asdynamic runtime variables,while the second, treatsprompts as codefor production settings. If you value reducing operational burden, then our advice is to encode prompts as code, as it is operationally simpler, unless the speed to change them really matters for you.To recap:Prompts as dynamic runtime variables. Use an external system to pass the prompts to your Hamilton dataflows, or use Hamilton to pull them from a DB. For debugging & monitoring, it’s important to be able to determine what prompt was used for a given invocation. You can integrate open source tools, or use something like the DAGWorks Platform to help ensure you know what was used for any invocation of your code.Prompts as code.Encoding the prompts as code allows easy versioning with git. Change management can be done via pull requests and CI/CD checks. It works well with Hamilton’s features like@config.whenand module switching at the Driver level because it determines clearly what version of the prompt is used. 
This approach strengthens the use of any tooling you might use to monitor or track, like the DAGWorks Platform, as prompts for a deployment are immutable.We want to hear from you!If you’re excited by any of this, or have strong opinions, leave a comment, or drop by our Slack channel! Some links to do praise/complain/chat:📣join our community on Slack—\\u200awe’re more than happy to help answer questions you might have or get you started.⭐️ us onGitHub.📝 leave us anissueif you find something.📚 read ourdocumentation.⌨️ interactivelylearn about Hamilton in your browser.Other Hamilton posts you might be interested in:We have a growing collection of posts & content. Here are some we think you might be interested in.Containerized PDF Summarizer with FastAPI and HamiltonThierry Jean,DAGWorks Inc., andStefan Krawczyk·Aug 18Skip learning convoluted LLM-specific frameworks and write your first LLM application using regular Python functions and Hamilton! In this post, we’ll present a containerized PDF summarizer powered by the OpenAI API. Its flow is encoded in Hamilton, which theRead full storyBuilding a maintainable and modular LLM application stack with HamiltonThierry JeanandDAGWorks Inc.·Jul 11In this post, we’re going to share how Hamilton can help you write modular and maintainable code for your large language model (LLM) application stack. Hamilton is great for describing any type of dataflow, which is exactly what you’re doing when building an LLM powered application. With Hamilton you get strongRead full storytryhamilton.dev– an interactive tutorial in your browser!Hamilton + Airflow(GitHub repo)Hamilton + Feast(GitHub repo)Pandas data transformations in Hamilton in 5 minutesLineage + Hamilton in 10 minutes Thanks for reading DAGWorks’s Substack! Subscribe for free to receive updates and posts like this!Subscribe Thanks for reading DAGWorks’s Substack! Subscribe for free to receive updates and posts like this!Subscribe Thanks for reading DAGWorks’s Substack! 
Subscribe for free to receive updates and posts like this! Subscribe Subscribe Subscribe Subscribe       Point:Prompts + LLM APIs are analogous to hyper-parameters + machine learning models.          MLOps Analogy: diagram showing how ML model auto reloading would look.   Diagram showing what dynamically reloading/querying prompts would look like.      summarization_shortened.py. Note the two inputs “*_prompt” that denote prompts that are now required as input to the dataflow to function. With Hamilton you’ll be able to determine what inputs should be required for your prompt template by just looking at a diagram like this. Diagram created via Hamilton.            MLOps Analogy: make an immutable deployment by making the model fixed for your app’s deployment.   Diagram showing how treating prompts as code enables you to leverage your CI/CD and build an immutable deployment for talking to your LLM API.      What summarization.py in the PDF summarizer example looks like. The prompt templates are part of the code. Diagram created via Hamilton.               Thank you for reading DAGWorks’s Substack. This post is public so feel free to share it.Share Thank you for reading DAGWorks’s Substack. This post is public so feel free to share it.          Containerized PDF Summarizer with FastAPI and HamiltonThierry Jean,DAGWorks Inc., andStefan Krawczyk·Aug 18Skip learning convoluted LLM-specific frameworks and write your first LLM application using regular Python functions and Hamilton! In this post, we’ll present a containerized PDF summarizer powered by the OpenAI API. Its flow is encoded in Hamilton, which theRead full story Containerized PDF Summarizer with FastAPI and HamiltonThierry Jean,DAGWorks Inc., andStefan Krawczyk·Aug 18Skip learning convoluted LLM-specific frameworks and write your first LLM application using regular Python functions and Hamilton! In this post, we’ll present a containerized PDF summarizer powered by the OpenAI API. 
Its flow is encoded in Hamilton, which theRead full story Thierry Jean,DAGWorks Inc., andStefan Krawczyk·Aug 18 Thierry Jean,DAGWorks Inc., andStefan Krawczyk · Aug 18  Skip learning convoluted LLM-specific frameworks and write your first LLM application using regular Python functions and Hamilton! In this post, we’ll present a containerized PDF summarizer powered by the OpenAI API. Its flow is encoded in Hamilton, which the Read full story Read full story Building a maintainable and modular LLM application stack with HamiltonThierry JeanandDAGWorks Inc.·Jul 11In this post, we’re going to share how Hamilton can help you write modular and maintainable code for your large language model (LLM) application stack. Hamilton is great for describing any type of dataflow, which is exactly what you’re doing when building an LLM powered application. With Hamilton you get strongRead full story Building a maintainable and modular LLM application stack with HamiltonThierry JeanandDAGWorks Inc.·Jul 11In this post, we’re going to share how Hamilton can help you write modular and maintainable code for your large language model (LLM) application stack. Hamilton is great for describing any type of dataflow, which is exactly what you’re doing when building an LLM powered application. With Hamilton you get strongRead full story Thierry JeanandDAGWorks Inc.·Jul 11 Thierry JeanandDAGWorks Inc. · Jul 11  In this post, we’re going to share how Hamilton can help you write modular and maintainable code for your large language model (LLM) application stack. Hamilton is great for describing any type of dataflow, which is exactly what you’re doing when building an LLM powered application. 
With Hamilton you get strong Read full story Read full story  4Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherShare 4Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOtherShare 4Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther 4Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther 4 Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther Share this postLLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther Share this post LLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.ioCopy linkFacebookEmailNoteOther LLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.io  LLMOps: Production prompt engineering patterns with Hamiltonblog.dagworks.io blog.dagworks.io Copy linkFacebookEmailNoteOther  Copy link  Facebook  Email  Note  Other Share Share Comments Comments Comments      TopNewNo posts TopNewNo posts  TopNewNo posts TopNewNo posts TopNew TopNew Top New  No posts  Ready for more?Subscribe Ready for more?Subscribe Subscribe Subscribe Subscribe Subscribe    © 2023 DAGWorks Inc.Privacy∙Terms∙Collection noticeStart WritingGet the appSubstackis the home for great writing  © 2023 DAGWorks Inc.Privacy∙Terms∙Collection noticeStart WritingGet the appSubstackis the home for 
great writing © 2023 DAGWorks Inc.Privacy∙Terms∙Collection noticeStart WritingGet the appSubstackis the home for great writing © 2023 DAGWorks Inc.Privacy∙Terms∙Collection notice © 2023 DAGWorks Inc. Privacy∙Terms∙Collection notice Start WritingGet the app Substackis the home for great writing   This site requires JavaScript to run correctly. Pleaseturn on JavaScriptor unblock scripts'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res[\"parsed_html_collection\"][0].parsed"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
